diff --git a/.circleci/config.yml b/.circleci/config.yml index ea93575ac9430..6f134c9a7a7bd 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -72,10 +72,6 @@ jobs: no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | pip3 install cibuildwheel==2.15.0 - # When this is a nightly wheel build, allow picking up NumPy 2.0 dev wheels: - if [[ "$IS_SCHEDULE_DISPATCH" == "true" || "$IS_PUSH" != 'true' ]]; then - export CIBW_ENVIRONMENT="PIP_EXTRA_INDEX_URL=https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - fi cibuildwheel --prerelease-pythons --output-dir wheelhouse environment: diff --git a/.github/ISSUE_TEMPLATE/pdep_vote.yaml b/.github/ISSUE_TEMPLATE/pdep_vote.yaml new file mode 100644 index 0000000000000..6dcbd76eb0f74 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/pdep_vote.yaml @@ -0,0 +1,74 @@ +name: PDEP Vote +description: Call for a vote on a PDEP +title: "VOTE: " +labels: [Vote] + +body: + - type: markdown + attributes: + value: > + As per [PDEP-1](https://pandas.pydata.org/pdeps/0001-purpose-and-guidelines.html), the following issue template should be used when a + maintainer has opened a PDEP discussion and is ready to call for a vote. + - type: checkboxes + attributes: + label: Locked issue + options: + - label: > + I locked this voting issue so that only voting members are able to cast their votes or + comment on this issue. + required: true + - type: input + id: PDEP-name + attributes: + label: PDEP number and title + placeholder: > + PDEP-1: Purpose and guidelines + validations: + required: true + - type: input + id: PDEP-link + attributes: + label: Pull request with discussion + description: e.g. https://github.com/pandas-dev/pandas/pull/47444 + validations: + required: true + - type: input + id: PDEP-rendered-link + attributes: + label: Rendered PDEP for easy reading + description: e.g. https://github.com/pandas-dev/pandas/pull/47444/files?short_path=7c449e6#diff-7c449e698132205b235c501f7e47ebba38da4d2b7f9492c98f16745dba787041 + validations: + required: true + - type: input + id: PDEP-number-of-discussion-participants + attributes: + label: Discussion participants + description: > + You may find it useful to list or total the number of participating members in the + PDEP discussion PR. This would be the maximum possible disapprove votes. + placeholder: > + 14 voting members participated in the PR discussion thus far. + - type: input + id: PDEP-vote-end + attributes: + label: Voting will close in 15 days. + description: The voting period end date. ('Voting will close in 15 days.' will be automatically written) + - type: markdown + attributes: + value: --- + - type: textarea + id: Vote + attributes: + label: Vote + value: | + Cast your vote in a comment below. + * +1: approve. + * 0: abstain. + * Reason: A one sentence reason is required. + * -1: disapprove + * Reason: A one sentence reason is required. + A disapprove vote requires prior participation in the linked discussion PR. 
+ + @pandas-dev/pandas-core + validations: + required: true diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 24b2de251ce8e..937af7e49c6d3 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -85,7 +85,7 @@ jobs: echo "PYTHONPATH=$PYTHONPATH" >> $GITHUB_ENV if: ${{ steps.build.outcome == 'success' && always() }} - - name: Typing + pylint + - name: Typing uses: pre-commit/action@v3.0.1 with: extra_args: --verbose --hook-stage manual --all-files diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 470c044d2e99e..4bd9068e91b67 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -139,8 +139,7 @@ jobs: shell: bash -el {0} run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - - name: Build normal wheels - if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }} + - name: Build wheels uses: pypa/cibuildwheel@v2.17.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} @@ -148,18 +147,6 @@ jobs: CIBW_PRERELEASE_PYTHONS: True CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} - - name: Build nightly wheels (with NumPy pre-release) - if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }} - uses: pypa/cibuildwheel@v2.17.0 - with: - package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} - env: - # The nightly wheels should be build witht he NumPy 2.0 pre-releases - # which requires the additional URL. - CIBW_ENVIRONMENT: PIP_EXTRA_INDEX_URL=https://pypi.anaconda.org/scientific-python-nightly-wheels/simple - CIBW_PRERELEASE_PYTHONS: True - CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} - - name: Set up Python uses: mamba-org/setup-micromamba@v1 with: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 41f1c4c6892a3..8c546ccae41e5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,10 +16,10 @@ ci: autofix_prs: false autoupdate_schedule: monthly # manual stage hooks - skip: [pylint, pyright, mypy] + skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.1 + rev: v0.3.4 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -30,16 +30,10 @@ repos: files: ^pandas exclude: ^pandas/tests args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix] - - id: ruff - name: ruff-use-pd_array-in-core - alias: ruff-use-pd_array-in-core - files: ^pandas/core/ - exclude: ^pandas/core/api\.py$ - args: [--select, "ICN001", --exit-non-zero-on-fix] - id: ruff-format exclude: ^scripts - repo: https://github.com/jendrikseipp/vulture - rev: 'v2.10' + rev: 'v2.11' hooks: - id: vulture entry: python scripts/run_vulture.py @@ -73,31 +67,12 @@ repos: - id: fix-encoding-pragma args: [--remove] - id: trailing-whitespace -- repo: https://github.com/pylint-dev/pylint - rev: v3.0.1 - hooks: - - id: pylint - stages: [manual] - args: [--load-plugins=pylint.extensions.redefined_loop_name, --fail-on=I0021] - - id: pylint - alias: redefined-outer-name - name: Redefining name from outer scope - files: ^pandas/ - exclude: | - (?x) - ^pandas/tests # keep excluded - |/_testing/ # keep excluded - |^pandas/util/_test_decorators\.py # keep excluded - |^pandas/_version\.py # keep excluded - |^pandas/conftest\.py # keep excluded - args: [--disable=all, --enable=redefined-outer-name] - stages: [manual] - repo: 
https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v3.15.0 + rev: v3.15.2 hooks: - id: pyupgrade args: [--py39-plus] @@ -116,7 +91,7 @@ repos: hooks: - id: sphinx-lint - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v17.0.6 + rev: v18.1.2 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index e02ff26ba14e9..30c692115eab1 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -41,6 +41,7 @@ // pip (with all the conda available packages installed first, // followed by the pip installed packages). "matrix": { + "pip+build": [], "Cython": ["3.0"], "matplotlib": [], "sqlalchemy": [], diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 69697906e493e..7d5b250c7b157 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -24,7 +24,7 @@ def setup(self): self.codes = np.tile(range(len(self.categories)), N) self.datetimes = pd.Series( - pd.date_range("1995-01-01 00:00:00", periods=N / 10, freq="s") + pd.date_range("1995-01-01 00:00:00", periods=N // 10, freq="s") ) self.datetimes_with_nat = self.datetimes.copy() self.datetimes_with_nat.iloc[-1] = pd.NaT diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index ce31d63f0b70f..6a2ab24df26fe 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -862,4 +862,28 @@ def time_last_valid_index(self, dtype): self.df.last_valid_index() +class Update: + def setup(self): + rng = np.random.default_rng() + self.df = DataFrame(rng.uniform(size=(1_000_000, 10))) + + idx = rng.choice(range(1_000_000), size=1_000_000, replace=False) + self.df_random = DataFrame(self.df, index=idx) + + idx = rng.choice(range(1_000_000), size=100_000, replace=False) + cols = rng.choice(range(10), size=2, replace=False) + self.df_sample = DataFrame( + rng.uniform(size=(100_000, 2)), index=idx, columns=cols + ) + + def time_to_update_big_frame_small_arg(self): + self.df.update(self.df_sample) + + def time_to_update_random_indices(self): + self.df_random.update(self.df_sample) + + def time_to_update_small_frame_big_arg(self): + self.df_sample.update(self.df) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 06f488f7baaaf..8deec502898d9 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -29,7 +29,7 @@ def setup(self, index_type): "dst": date_range( start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="s" ), - "repeated": date_range(start="2000", periods=N / 10, freq="s").repeat(10), + "repeated": date_range(start="2000", periods=N // 10, freq="s").repeat(10), "tz_aware": date_range(start="2000", periods=N, freq="s", tz="US/Eastern"), "tz_local": date_range( start="2000", periods=N, freq="s", tz=dateutil.tz.tzlocal() diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 0c4e6641444f1..fbd71af9b60de 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -81,30 +81,21 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.CategoricalIndex.ordered SA01" \ -i "pandas.DataFrame.__dataframe__ SA01" \ -i "pandas.DataFrame.__iter__ SA01" \ - -i "pandas.DataFrame.assign SA01" \ -i "pandas.DataFrame.at_time PR01" \ - -i "pandas.DataFrame.axes SA01" \ - -i 
"pandas.DataFrame.backfill PR01,SA01" \ - -i "pandas.DataFrame.bfill SA01" \ -i "pandas.DataFrame.columns SA01" \ -i "pandas.DataFrame.copy SA01" \ -i "pandas.DataFrame.droplevel SA01" \ - -i "pandas.DataFrame.dtypes SA01" \ - -i "pandas.DataFrame.ffill SA01" \ -i "pandas.DataFrame.first_valid_index SA01" \ - -i "pandas.DataFrame.get SA01" \ -i "pandas.DataFrame.hist RT03" \ -i "pandas.DataFrame.infer_objects RT03" \ -i "pandas.DataFrame.keys SA01" \ -i "pandas.DataFrame.kurt RT03,SA01" \ -i "pandas.DataFrame.kurtosis RT03,SA01" \ -i "pandas.DataFrame.last_valid_index SA01" \ - -i "pandas.DataFrame.mask RT03" \ -i "pandas.DataFrame.max RT03" \ -i "pandas.DataFrame.mean RT03,SA01" \ -i "pandas.DataFrame.median RT03,SA01" \ -i "pandas.DataFrame.min RT03" \ - -i "pandas.DataFrame.pad PR01,SA01" \ -i "pandas.DataFrame.plot PR02,SA01" \ -i "pandas.DataFrame.pop SA01" \ -i "pandas.DataFrame.prod RT03" \ @@ -119,19 +110,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.sparse.to_dense SA01" \ -i "pandas.DataFrame.std PR01,RT03,SA01" \ -i "pandas.DataFrame.sum RT03" \ - -i "pandas.DataFrame.swapaxes PR01,SA01" \ -i "pandas.DataFrame.swaplevel SA01" \ -i "pandas.DataFrame.to_feather SA01" \ -i "pandas.DataFrame.to_markdown SA01" \ -i "pandas.DataFrame.to_parquet RT03" \ - -i "pandas.DataFrame.to_period SA01" \ - -i "pandas.DataFrame.to_timestamp SA01" \ - -i "pandas.DataFrame.tz_convert SA01" \ - -i "pandas.DataFrame.tz_localize SA01" \ - -i "pandas.DataFrame.unstack RT03" \ - -i "pandas.DataFrame.value_counts RT03" \ -i "pandas.DataFrame.var PR01,RT03,SA01" \ - -i "pandas.DataFrame.where RT03" \ -i "pandas.DatetimeIndex.ceil SA01" \ -i "pandas.DatetimeIndex.date SA01" \ -i "pandas.DatetimeIndex.day SA01" \ @@ -165,12 +148,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeTZDtype SA01" \ -i "pandas.DatetimeTZDtype.tz SA01" \ -i "pandas.DatetimeTZDtype.unit SA01" \ - -i "pandas.ExcelFile PR01,SA01" \ - -i "pandas.ExcelFile.parse PR01,SA01" \ - -i "pandas.ExcelWriter SA01" \ - -i "pandas.Float32Dtype SA01" \ - -i "pandas.Float64Dtype SA01" \ - -i "pandas.Grouper PR02,SA01" \ + -i "pandas.Grouper PR02" \ -i "pandas.HDFStore.append PR01,SA01" \ -i "pandas.HDFStore.get SA01" \ -i "pandas.HDFStore.groups SA01" \ @@ -226,7 +204,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.to_list RT03" \ -i "pandas.Index.union PR07,RT03,SA01" \ -i "pandas.Index.unique RT03" \ - -i "pandas.Index.value_counts RT03" \ -i "pandas.Index.view GL08" \ -i "pandas.Int16Dtype SA01" \ -i "pandas.Int32Dtype SA01" \ @@ -321,7 +298,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.add PR07" \ -i "pandas.Series.at_time PR01" \ -i "pandas.Series.backfill PR01,SA01" \ - -i "pandas.Series.bfill SA01" \ -i "pandas.Series.case_when RT03" \ -i "pandas.Series.cat PR07,SA01" \ -i "pandas.Series.cat.add_categories PR01,PR02" \ @@ -376,14 +352,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.dt.year SA01" \ -i "pandas.Series.dtype SA01" \ - -i "pandas.Series.dtypes SA01" \ -i "pandas.Series.empty GL08" \ -i "pandas.Series.eq PR07,SA01" \ - -i "pandas.Series.ffill SA01" \ -i "pandas.Series.first_valid_index SA01" \ -i "pandas.Series.floordiv PR07" \ -i "pandas.Series.ge PR07,SA01" \ - -i "pandas.Series.get SA01" \ -i "pandas.Series.gt PR07,SA01" \ -i "pandas.Series.hasnans SA01" \ -i "pandas.Series.infer_objects RT03" \ @@ -400,7 +373,6 @@ if [[ -z "$CHECK" || 
"$CHECK" == "docstrings" ]]; then -i "pandas.Series.list.flatten SA01" \ -i "pandas.Series.list.len SA01" \ -i "pandas.Series.lt PR07,SA01" \ - -i "pandas.Series.mask RT03" \ -i "pandas.Series.max RT03" \ -i "pandas.Series.mean RT03,SA01" \ -i "pandas.Series.median RT03,SA01" \ @@ -477,17 +449,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.to_frame SA01" \ -i "pandas.Series.to_list RT03" \ -i "pandas.Series.to_markdown SA01" \ - -i "pandas.Series.to_period SA01" \ -i "pandas.Series.to_string SA01" \ - -i "pandas.Series.to_timestamp RT03,SA01" \ -i "pandas.Series.truediv PR07" \ - -i "pandas.Series.tz_convert SA01" \ - -i "pandas.Series.tz_localize SA01" \ - -i "pandas.Series.unstack SA01" \ -i "pandas.Series.update PR07,SA01" \ - -i "pandas.Series.value_counts RT03" \ -i "pandas.Series.var PR01,RT03,SA01" \ - -i "pandas.Series.where RT03" \ -i "pandas.SparseDtype SA01" \ -i "pandas.Timedelta PR07,SA01" \ -i "pandas.Timedelta.as_unit SA01" \ @@ -681,60 +646,40 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.__iter__ RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \ - -i "pandas.core.groupby.DataFrameGroupBy.apply RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.cummax RT03" \ - -i "pandas.core.groupby.DataFrameGroupBy.cummin RT03" \ - -i "pandas.core.groupby.DataFrameGroupBy.cumprod RT03" \ - -i "pandas.core.groupby.DataFrameGroupBy.cumsum RT03" \ - -i "pandas.core.groupby.DataFrameGroupBy.filter RT03,SA01" \ + -i "pandas.core.groupby.DataFrameGroupBy.filter SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.max SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.mean RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.median SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.min SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \ - -i "pandas.core.groupby.DataFrameGroupBy.nunique RT03,SA01" \ + -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.prod SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.rank RT03" \ - -i "pandas.core.groupby.DataFrameGroupBy.resample RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.skew RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.sum SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.transform RT03" \ -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.agg RT03" \ -i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \ - -i "pandas.core.groupby.SeriesGroupBy.apply RT03" \ - -i "pandas.core.groupby.SeriesGroupBy.cummax RT03" \ - -i "pandas.core.groupby.SeriesGroupBy.cummin RT03" \ - -i "pandas.core.groupby.SeriesGroupBy.cumprod RT03" \ - -i "pandas.core.groupby.SeriesGroupBy.cumsum RT03" \ - -i "pandas.core.groupby.SeriesGroupBy.filter PR01,RT03,SA01" \ + -i "pandas.core.groupby.SeriesGroupBy.filter PR01,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.groups SA01" \ -i 
"pandas.core.groupby.SeriesGroupBy.indices SA01" \ -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \ -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \ -i "pandas.core.groupby.SeriesGroupBy.max SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.mean RT03" \ -i "pandas.core.groupby.SeriesGroupBy.median SA01" \ -i "pandas.core.groupby.SeriesGroupBy.min SA01" \ -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \ -i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.prod SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.rank RT03" \ - -i "pandas.core.groupby.SeriesGroupBy.resample RT03" \ -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.skew RT03" \ -i "pandas.core.groupby.SeriesGroupBy.sum SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.transform RT03" \ -i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \ -i "pandas.core.resample.Resampler.ffill RT03" \ -i "pandas.core.resample.Resampler.get_group RT03,SA01" \ diff --git a/doc/redirects.csv b/doc/redirects.csv index e71a031bd67fd..c11e4e242f128 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -1422,7 +1422,6 @@ reference/api/pandas.Series.transpose,pandas.Series.T reference/api/pandas.Index.transpose,pandas.Index.T reference/api/pandas.Index.notnull,pandas.Index.notna reference/api/pandas.Index.tolist,pandas.Index.to_list -reference/api/pandas.arrays.PandasArray,pandas.arrays.NumpyExtensionArray reference/api/pandas.core.groupby.DataFrameGroupBy.backfill,pandas.core.groupby.DataFrameGroupBy.bfill reference/api/pandas.core.groupby.GroupBy.backfill,pandas.core.groupby.DataFrameGroupBy.bfill reference/api/pandas.core.resample.Resampler.backfill,pandas.core.resample.Resampler.bfill diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 3cd9e030d6b3c..cf5f15ceb8344 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -269,6 +269,8 @@ SciPy 1.10.0 computation Miscellaneous stati xarray 2022.12.0 computation pandas-like API for N-dimensional data ========================= ================== =============== ============================================================= +.. _install.excel_dependencies: + Excel files ^^^^^^^^^^^ diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index ae658ec6abbaf..aa032b186aeb9 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -111,6 +111,12 @@ strings (``object``). My colleague requested the Titanic data as a spreadsheet. +.. note:: + If you want to use :func:`~pandas.to_excel` and :func:`~pandas.read_excel`, + you need to install an Excel reader as outlined in the + :ref:`Excel files ` section of the + installation documentation. + .. ipython:: python titanic.to_excel("titanic.xlsx", sheet_name="passengers", index=False) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index eed3fc149263a..92799359a61d2 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -476,15 +476,15 @@ For example: .. ipython:: python df - df.mean(0) - df.mean(1) + df.mean(axis=0) + df.mean(axis=1) All such methods have a ``skipna`` option signaling whether to exclude missing data (``True`` by default): .. 
ipython:: python - df.sum(0, skipna=False) + df.sum(axis=0, skipna=False) df.sum(axis=1, skipna=True) Combined with the broadcasting / arithmetic behavior, one can describe various @@ -495,8 +495,8 @@ standard deviation of 1), very concisely: ts_stand = (df - df.mean()) / df.std() ts_stand.std() - xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0) - xs_stand.std(1) + xs_stand = df.sub(df.mean(axis=1), axis=0).div(df.std(axis=1), axis=0) + xs_stand.std(axis=1) Note that methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` preserve the location of ``NaN`` values. This is somewhat different from diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 7f957a8b16787..8c222aff52fd7 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -416,6 +416,12 @@ You can also include the grouping columns if you want to operate on them. grouped[["A", "B"]].sum() +.. note:: + + The ``groupby`` operation in Pandas drops the ``name`` field of the columns Index object + after the operation. This change ensures consistency in syntax between different + column selection methods within groupby operations. + .. _groupby.iterating-label: Iterating through groups diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 24cdbad41fe60..0da87e1d31fec 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -262,6 +262,10 @@ The most robust and consistent way of slicing ranges along arbitrary axes is described in the :ref:`Selection by Position ` section detailing the ``.iloc`` method. For now, we explain the semantics of slicing using the ``[]`` operator. + .. note:: + + When the :class:`Series` has float indices, slicing will select by position. + With Series, the syntax works exactly as with an ndarray, returning a slice of the values and the corresponding labels: @@ -948,7 +952,7 @@ To select a row where each column meets its own criterion: values = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]} - row_mask = df.isin(values).all(1) + row_mask = df.isin(values).all(axis=1) df[row_mask] diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 2e104ac06f9f4..5149bd30dbbef 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -386,6 +386,27 @@ Replace NA with a scalar value df df.fillna(0) +When the data has object dtype, you can control what type of NA values are present. + +.. ipython:: python + + df = pd.DataFrame({"a": [pd.NA, np.nan, None]}, dtype=object) + df + df.fillna(None) + df.fillna(np.nan) + df.fillna(pd.NA) + +However when the dtype is not object, these will all be replaced with the proper NA value for the dtype. + +.. ipython:: python + + data = {"np": [1.0, np.nan, np.nan, 2], "arrow": pd.array([1.0, pd.NA, pd.NA, 2], dtype="float64[pyarrow]")} + df = pd.DataFrame(data) + df + df.fillna(None) + df.fillna(np.nan) + df.fillna(pd.NA) + Fill gaps forward or backward .. ipython:: python diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 310dd921e44f6..4db0069ec4b95 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -87,4 +87,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.2.0..v2.2.1|HEAD +.. 
contributors:: v2.2.0..v2.2.1 diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 0dac3660c76b2..72a2f84c4aaee 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -1,6 +1,6 @@ .. _whatsnew_222: -What's new in 2.2.2 (April XX, 2024) +What's new in 2.2.2 (April 10, 2024) --------------------------------------- These are the changes in pandas 2.2.2. See :ref:`release` for a full changelog @@ -9,6 +9,21 @@ including other versions of pandas. {{ header }} .. --------------------------------------------------------------------------- + +.. _whatsnew_220.np2_compat: + +Pandas 2.2.2 is now compatible with numpy 2.0 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Pandas 2.2.2 is the first version of pandas that is generally compatible with the upcoming +numpy 2.0 release, and wheels for pandas 2.2.2 will work with both numpy 1.x and 2.x. + +One major caveat is that arrays created with numpy 2.0's new ``StringDtype`` will convert +to ``object`` dtyped arrays upon :class:`Series`/:class:`DataFrame` creation. +Full support for numpy 2.0's StringDtype is expected to land in pandas 3.0. + +As usual please report any bugs discovered to our `issue tracker <https://github.com/pandas-dev/pandas/issues>`_ + .. _whatsnew_222.regressions: Fixed regressions @@ -40,3 +55,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.2.1..v2.2.2|HEAD diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 74a19472ec835..7a4f709e56104 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -28,14 +28,17 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ +- :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) +- Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`) - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`) - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) -- +- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) +- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: @@ -146,7 +149,6 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ - 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. 
Use :py:class:`pathlib.Path` objects instead (:issue:`57091`) -- :attr:`MultiIndex.codes`, :attr:`MultiIndex.levels`, and :attr:`MultiIndex.names` now returns a ``tuple`` instead of a ``FrozenList`` (:issue:`53531`) - :func:`read_table`'s ``parse_dates`` argument defaults to ``None`` to improve consistency with :func:`read_csv` (:issue:`57476`) - Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`) @@ -190,6 +192,7 @@ Other Deprecations - Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`) - Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`, :meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt`. (:issue:`57087`) - Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf``. (:issue:`57280`) - Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`) - @@ -207,20 +210,28 @@ Removal of prior version deprecations/changes - :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`) - All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`) - All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`) +- Disallow allowing logical operations (``||``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. 
``list``, ``tuple``); wrap the objects in :class:`Series`, :class:`Index`, or ``np.array`` first instead (:issue:`52264`) +- Disallow automatic casting to object in :class:`Series` logical operations (``&``, ``^``, ``||``) between series with mismatched indexes and dtypes other than ``object`` or ``bool`` (:issue:`52538`) +- Disallow calling :meth:`Series.replace` or :meth:`DataFrame.replace` without a ``value`` and with non-dict-like ``to_replace`` (:issue:`33302`) +- Disallow constructing a :class:`arrays.SparseArray` with scalar data (:issue:`53039`) - Disallow non-standard (``np.ndarray``, :class:`Index`, :class:`ExtensionArray`, or :class:`Series`) to :func:`isin`, :func:`unique`, :func:`factorize` (:issue:`52986`) - Disallow passing a pandas type to :meth:`Index.view` (:issue:`55709`) - Disallow units other than "s", "ms", "us", "ns" for datetime64 and timedelta64 dtypes in :func:`array` (:issue:`53817`) - Removed "freq" keyword from :class:`PeriodArray` constructor, use "dtype" instead (:issue:`52462`) +- Removed 'fastpath' keyword in :class:`Categorical` constructor (:issue:`20110`) +- Removed alias :class:`arrays.PandasArray` for :class:`arrays.NumpyExtensionArray` (:issue:`53694`) - Removed deprecated "method" and "limit" keywords from :meth:`Series.replace` and :meth:`DataFrame.replace` (:issue:`53492`) - Removed extension test classes ``BaseNoReduceTests``, ``BaseNumericReduceTests``, ``BaseBooleanReduceTests`` (:issue:`54663`) - Removed the "closed" and "normalize" keywords in :meth:`DatetimeIndex.__new__` (:issue:`52628`) +- Require :meth:`SparseDtype.fill_value` to be a valid value for the :meth:`SparseDtype.subtype` (:issue:`53043`) - Stopped performing dtype inference with in :meth:`Index.insert` with object-dtype index; this often affects the index/columns that result when setting new entries into an empty :class:`Series` or :class:`DataFrame` (:issue:`51363`) - Removed the "closed" and "unit" keywords in :meth:`TimedeltaIndex.__new__` (:issue:`52628`, :issue:`55499`) - All arguments in :meth:`Index.sort_values` are now keyword only (:issue:`56493`) - All arguments in :meth:`Series.to_dict` are now keyword only (:issue:`56493`) - Changed the default value of ``observed`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` to ``True`` (:issue:`51811`) - Enforce deprecation in :func:`testing.assert_series_equal` and :func:`testing.assert_frame_equal` with object dtype and mismatched null-like values, which are now considered not-equal (:issue:`18463`) -- Enforced deprecation ``all`` and ``any`` reductions with ``datetime64`` and :class:`DatetimeTZDtype` dtypes (:issue:`58029`) +- Enforced deprecation ``all`` and ``any`` reductions with ``datetime64``, :class:`DatetimeTZDtype`, and :class:`PeriodDtype` dtypes (:issue:`58029`) +- Enforced deprecation disallowing ``float`` "periods" in :func:`date_range`, :func:`period_range`, :func:`timedelta_range`, :func:`interval_range`, (:issue:`56036`) - Enforced deprecation disallowing parsing datetimes with mixed time zones unless user passes ``utc=True`` to :func:`to_datetime` (:issue:`57275`) - Enforced deprecation in :meth:`Series.value_counts` and :meth:`Index.value_counts` with object dtype performing dtype inference on the ``.index`` of the result (:issue:`56161`) - Enforced deprecation of :meth:`.DataFrameGroupBy.get_group` and :meth:`.SeriesGroupBy.get_group` allowing the ``name`` argument to be a non-tuple when grouping by a list of length 1 (:issue:`54155`) @@ -296,6 +307,8 @@ Performance improvements 
~~~~~~~~~~~~~~~~~~~~~~~~ - :attr:`Categorical.categories` returns a :class:`RangeIndex` columns instead of an :class:`Index` if the constructed ``values`` was a ``range``. (:issue:`57787`) - :class:`DataFrame` returns a :class:`RangeIndex` columns when possible when ``data`` is a ``dict`` (:issue:`57943`) +- :class:`Series` returns a :class:`RangeIndex` index when possible when ``data`` is a ``dict`` (:issue:`58118`) +- :func:`concat` returns a :class:`RangeIndex` column when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`) - :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`) - :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`) @@ -318,6 +331,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) +- Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`) @@ -327,17 +341,6 @@ Performance improvements Bug fixes ~~~~~~~~~ -- Fixed bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`) -- Fixed bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) -- Fixed bug in :meth:`DataFrame.cumsum` which was raising ``IndexError`` if dtype is ``timedelta64[ns]`` (:issue:`57956`) -- Fixed bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) -- Fixed bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) -- Fixed bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`) -- Fixed bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) -- Fixed bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) -- Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) -- Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) -- Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. 
(:issue:`56599`) Categorical ^^^^^^^^^^^ @@ -346,14 +349,15 @@ Categorical Datetimelike ^^^^^^^^^^^^ +- Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`) - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`) -- +- Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) Timedelta ^^^^^^^^^ - Accuracy improvement in :meth:`Timedelta.to_pytimedelta` to round microseconds consistently for large nanosecond based Timedelta (:issue:`57841`) -- +- Bug in :meth:`DataFrame.cumsum` which was raising ``IndexError`` if dtype is ``timedelta64[ns]`` (:issue:`57956`) Timezones ^^^^^^^^^ @@ -367,6 +371,7 @@ Numeric Conversion ^^^^^^^^^^ +- Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`) - Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`) @@ -382,7 +387,7 @@ Interval Indexing ^^^^^^^^ -- +- Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) - Missing @@ -397,10 +402,10 @@ MultiIndex I/O ^^^ +- Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) -- Now all ``Mapping`` s are pretty printed correctly. Before only literal ``dict`` s were. (:issue:`57915`) -- -- +- Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) +- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) Period ^^^^^^ @@ -409,29 +414,31 @@ Period Plotting ^^^^^^^^ -- +- Bug in :meth:`.DataFrameGroupBy.boxplot` failed when there were multiple groupings (:issue:`14701`) - Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) +- Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) -- +- Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) + Reshaping ^^^^^^^^^ -- +- Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) - Sparse ^^^^^^ -- +- Bug in :class:`SparseDtype` for equal comparison with na fill value. 
(:issue:`54770`) - ExtensionArray ^^^^^^^^^^^^^^ -- Fixed bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) +- Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) - Styler @@ -441,10 +448,15 @@ Styler Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) -- Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) +- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) +- Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) +- Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) +- Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) +- Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) .. 
***DO NOT USE THIS SECTION*** diff --git a/environment.yml b/environment.yml index 186d7e1d703df..c8f55621070ae 100644 --- a/environment.yml +++ b/environment.yml @@ -89,6 +89,7 @@ dependencies: - numpydoc - pydata-sphinx-theme=0.14 - pytest-cython # doctest + - docutils < 0.21 # https://github.com/sphinx-doc/sphinx/issues/12302 - sphinx - sphinx-design - sphinx-copybutton diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index e2e93c5242b24..f2b4baf508986 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -351,7 +351,7 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): Py_ssize_t i, xi, yi, N, K int64_t minpv float64_t[:, ::1] result - ndarray[uint8_t, ndim=2] mask + uint8_t[:, :] mask int64_t nobs = 0 float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy @@ -407,7 +407,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr Py_ssize_t i, xi, yi, N, K ndarray[float64_t, ndim=2] result ndarray[float64_t, ndim=2] ranked_mat - ndarray[float64_t, ndim=1] rankedx, rankedy + float64_t[::1] rankedx, rankedy float64_t[::1] maskedx, maskedy ndarray[uint8_t, ndim=2] mask int64_t nobs = 0 @@ -566,8 +566,8 @@ def get_fill_indexer(const uint8_t[:] mask, limit=None): @cython.boundscheck(False) @cython.wraparound(False) def pad( - ndarray[numeric_object_t] old, - ndarray[numeric_object_t] new, + const numeric_object_t[:] old, + const numeric_object_t[:] new, limit=None ) -> ndarray: # -> ndarray[intp_t, ndim=1] @@ -691,8 +691,8 @@ def pad_2d_inplace(numeric_object_t[:, :] values, uint8_t[:, :] mask, limit=None @cython.boundscheck(False) @cython.wraparound(False) def backfill( - ndarray[numeric_object_t] old, - ndarray[numeric_object_t] new, + const numeric_object_t[:] old, + const numeric_object_t[:] new, limit=None ) -> ndarray: # -> ndarray[intp_t, ndim=1] """ @@ -786,7 +786,7 @@ def backfill_2d_inplace(numeric_object_t[:, :] values, @cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic(ndarray[numeric_object_t, ndim=1] arr, bint timelike): +def is_monotonic(const numeric_object_t[:] arr, bint timelike): """ Returns ------- @@ -1089,8 +1089,7 @@ cdef void rank_sorted_1d( float64_t[::1] out, int64_t[::1] grp_sizes, const intp_t[:] sort_indexer, - # TODO(cython3): make const (https://github.com/cython/cython/issues/3222) - numeric_object_t[:] masked_vals, + const numeric_object_t[:] masked_vals, const uint8_t[:] mask, bint check_mask, Py_ssize_t N, @@ -1378,7 +1377,7 @@ ctypedef fused out_t: @cython.boundscheck(False) @cython.wraparound(False) def diff_2d( - ndarray[diff_t, ndim=2] arr, # TODO(cython3) update to "const diff_t[:, :] arr" + const diff_t[:, :] arr, ndarray[out_t, ndim=2] out, Py_ssize_t periods, int axis, @@ -1386,8 +1385,7 @@ def diff_2d( ): cdef: Py_ssize_t i, j, sx, sy, start, stop - bint f_contig = arr.flags.f_contiguous - # bint f_contig = arr.is_f_contig() # TODO(cython3) once arr is memoryview + bint f_contig = arr.is_f_contig() diff_t left, right # Disable for unsupported dtype combinations, diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index c0b9ed42cb535..15f8727c38f8d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -511,9 +511,9 @@ def group_shift_indexer( @cython.wraparound(False) @cython.boundscheck(False) def group_fillna_indexer( - ndarray[intp_t] out, - ndarray[intp_t] labels, - ndarray[uint8_t] mask, + Py_ssize_t[::1] out, + const intp_t[::1] labels, + const uint8_t[:] mask, int64_t limit, bint compute_ffill, int ngroups, 
@@ -1179,13 +1179,13 @@ def group_ohlc( @cython.boundscheck(False) @cython.wraparound(False) def group_quantile( - ndarray[float64_t, ndim=2] out, + float64_t[:, ::1] out, ndarray[numeric_t, ndim=1] values, - ndarray[intp_t] labels, + const intp_t[::1] labels, const uint8_t[:] mask, const float64_t[:] qs, - ndarray[int64_t] starts, - ndarray[int64_t] ends, + const int64_t[::1] starts, + const int64_t[::1] ends, str interpolation, uint8_t[:, ::1] result_mask, bint is_datetimelike, @@ -1388,7 +1388,7 @@ cdef inline void _check_below_mincount( uint8_t[:, ::1] result_mask, Py_ssize_t ncounts, Py_ssize_t K, - int64_t[:, ::1] nobs, + const int64_t[:, ::1] nobs, int64_t min_count, mincount_t[:, ::1] resx, ) noexcept: @@ -1435,14 +1435,12 @@ cdef inline void _check_below_mincount( out[i, j] = 0 -# TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can -# use `const numeric_object_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) def group_last( numeric_object_t[:, ::1] out, int64_t[::1] counts, - ndarray[numeric_object_t, ndim=2] values, + const numeric_object_t[:, :] values, const intp_t[::1] labels, const uint8_t[:, :] mask, uint8_t[:, ::1] result_mask=None, @@ -1502,14 +1500,12 @@ def group_last( ) -# TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can -# use `const numeric_object_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) def group_nth( numeric_object_t[:, ::1] out, int64_t[::1] counts, - ndarray[numeric_object_t, ndim=2] values, + const numeric_object_t[:, :] values, const intp_t[::1] labels, const uint8_t[:, :] mask, uint8_t[:, ::1] result_mask=None, diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index 8b424e96973d3..a9bf784d5f973 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -11,7 +11,6 @@ import numpy as np from numpy cimport ( import_array, - ndarray, uint8_t, uint64_t, ) @@ -23,7 +22,7 @@ from pandas._libs.util cimport is_nan @cython.boundscheck(False) def hash_object_array( - ndarray[object] arr, str key, str encoding="utf8" + object[:] arr, str key, str encoding="utf8" ) -> np.ndarray[np.uint64]: """ Parameters diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index e3a9102fec395..5c6254c6a1ec7 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -3,7 +3,7 @@ Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ - +from cpython.unicode cimport PyUnicode_AsUTF8 {{py: @@ -98,7 +98,6 @@ from pandas._libs.khash cimport ( # VectorData # ---------------------------------------------------------------------- -from pandas._libs.tslibs.util cimport get_c_string from pandas._libs.missing cimport C_NA @@ -998,7 +997,7 @@ cdef class StringHashTable(HashTable): cdef: khiter_t k const char *v - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) k = kh_get_str(self.table, v) if k != self.table.n_buckets: @@ -1012,7 +1011,7 @@ cdef class StringHashTable(HashTable): int ret = 0 const char *v - v = get_c_string(key) + v = PyUnicode_AsUTF8(key) k = kh_put_str(self.table, v, &ret) if kh_exist_str(self.table, k): @@ -1037,7 +1036,7 @@ cdef class StringHashTable(HashTable): raise MemoryError() for i in range(n): val = values[i] - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) vecs[i] = v with nogil: @@ -1071,11 +1070,11 @@ cdef class StringHashTable(HashTable): val = values[i] if 
isinstance(val, str): - # GH#31499 if we have a np.str_ get_c_string won't recognize + # GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize # it as a str, even though isinstance does. - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) else: - v = get_c_string(self.na_string_sentinel) + v = PyUnicode_AsUTF8(self.na_string_sentinel) vecs[i] = v with nogil: @@ -1109,11 +1108,11 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, str): - # GH#31499 if we have a np.str_ get_c_string won't recognize + # GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize # it as a str, even though isinstance does. - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) else: - v = get_c_string(self.na_string_sentinel) + v = PyUnicode_AsUTF8(self.na_string_sentinel) vecs[i] = v with nogil: @@ -1195,9 +1194,9 @@ cdef class StringHashTable(HashTable): else: # if ignore_na is False, we also stringify NaN/None/etc. try: - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) except UnicodeEncodeError: - v = get_c_string(repr(val)) + v = PyUnicode_AsUTF8(repr(val)) vecs[i] = v # compute diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 8cd135c944dc6..12a5bf245977e 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -73,13 +73,13 @@ class MaskedUInt8Engine(MaskedIndexEngine): ... class MaskedBoolEngine(MaskedUInt8Engine): ... class BaseMultiIndexCodesEngine: - levels: tuple[np.ndarray] + levels: list[np.ndarray] offsets: np.ndarray # ndarray[uint64_t, ndim=1] def __init__( self, - levels: tuple[Index, ...], # all entries hashable - labels: tuple[np.ndarray], # all entries integer-dtyped + levels: list[Index], # all entries hashable + labels: list[np.ndarray], # all entries integer-dtyped offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1] ) -> None: ... def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ... diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index b39d32d069619..daaaacee3487d 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -67,7 +67,6 @@ def fast_multiget( default=..., ) -> ArrayLike: ... def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ... -def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ... 
@overload def map_infer( arr: np.ndarray, diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a2205454a5a46..7aa1cb715521e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -312,34 +312,6 @@ def item_from_zerodim(val: object) -> object: return val -@cython.wraparound(False) -@cython.boundscheck(False) -def fast_unique_multiple_list(lists: list, sort: bool | None = True) -> list: - cdef: - list buf - Py_ssize_t k = len(lists) - Py_ssize_t i, j, n - list uniques = [] - dict table = {} - object val, stub = 0 - - for i in range(k): - buf = lists[i] - n = len(buf) - for j in range(n): - val = buf[j] - if val not in table: - table[val] = stub - uniques.append(val) - if sort: - try: - uniques.sort() - except TypeError: - pass - - return uniques - - @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list: @@ -361,15 +333,15 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list: list buf Py_ssize_t j, n list uniques = [] - dict table = {} - object val, stub = 0 + set table = set() + object val for buf in gen: n = len(buf) for j in range(n): val = buf[j] if val not in table: - table[val] = stub + table.add(val) uniques.append(val) if sort: try: diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index effd7f586f266..aecf9f2e46bd4 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -70,11 +70,9 @@ from pandas._libs.tslibs.conversion cimport ( from pandas._libs.tslibs.dtypes cimport npy_unit_to_abbrev from pandas._libs.tslibs.nattype cimport ( NPY_NAT, - c_NaT as NaT, c_nat_strings as nat_strings, ) from pandas._libs.tslibs.timestamps cimport _Timestamp -from pandas._libs.tslibs.timezones cimport tz_compare from pandas._libs.tslibs import ( Resolution, @@ -347,39 +345,6 @@ def array_with_unit_to_datetime( return result, tz -cdef _array_with_unit_to_datetime_object_fallback(ndarray[object] values, str unit): - cdef: - Py_ssize_t i, n = len(values) - ndarray[object] oresult - tzinfo tz = None - - # TODO: fix subtle differences between this and no-unit code - oresult = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0) - for i in range(n): - val = values[i] - - if checknull_with_nat_and_na(val): - oresult[i] = NaT - elif is_integer_object(val) or is_float_object(val): - - if val != val or val == NPY_NAT: - oresult[i] = NaT - else: - try: - oresult[i] = Timestamp(val, unit=unit) - except OutOfBoundsDatetime: - oresult[i] = val - - elif isinstance(val, str): - if len(val) == 0 or val in nat_strings: - oresult[i] = NaT - - else: - oresult[i] = val - - return oresult, tz - - @cython.wraparound(False) @cython.boundscheck(False) def first_non_null(values: ndarray) -> int: @@ -452,13 +417,9 @@ cpdef array_to_datetime( ndarray[int64_t] iresult npy_datetimestruct dts bint utc_convert = bool(utc) - bint seen_datetime_offset = False bint is_raise = errors == "raise" bint is_coerce = errors == "coerce" - bint is_same_offsets _TSObject tsobj - float tz_offset - set out_tzoffset_vals = set() tzinfo tz, tz_out = None cnp.flatiter it = cnp.PyArray_IterNew(values) NPY_DATETIMEUNIT item_reso @@ -568,12 +529,12 @@ cpdef array_to_datetime( # dateutil timezone objects cannot be hashed, so # store the UTC offsets in seconds instead nsecs = tz.utcoffset(None).total_seconds() - out_tzoffset_vals.add(nsecs) - seen_datetime_offset = True + state.out_tzoffset_vals.add(nsecs) + state.found_aware_str = True else: # Add a marker for naive string, to track if we are # parsing mixed naive 
and aware strings - out_tzoffset_vals.add("naive") + state.out_tzoffset_vals.add("naive") state.found_naive_str = True else: @@ -588,41 +549,7 @@ cpdef array_to_datetime( raise return values, None - if seen_datetime_offset and not utc_convert: - # GH#17697, GH#57275 - # 1) If all the offsets are equal, return one offset for - # the parsed dates to (maybe) pass to DatetimeIndex - # 2) If the offsets are different, then do not force the parsing - # and raise a ValueError: "cannot parse datetimes with - # mixed time zones unless `utc=True`" instead - is_same_offsets = len(out_tzoffset_vals) == 1 - if not is_same_offsets: - raise ValueError( - "Mixed timezones detected. Pass utc=True in to_datetime " - "or tz='UTC' in DatetimeIndex to convert to a common timezone." - ) - elif state.found_naive or state.found_other: - # e.g. test_to_datetime_mixed_awareness_mixed_types - raise ValueError("Cannot mix tz-aware with tz-naive values") - elif tz_out is not None: - # GH#55693 - tz_offset = out_tzoffset_vals.pop() - tz_out2 = timezone(timedelta(seconds=tz_offset)) - if not tz_compare(tz_out, tz_out2): - # e.g. test_to_datetime_mixed_tzs_mixed_types - raise ValueError( - "Mixed timezones detected. Pass utc=True in to_datetime " - "or tz='UTC' in DatetimeIndex to convert to a common timezone." - ) - # e.g. test_to_datetime_mixed_types_matching_tzs - else: - tz_offset = out_tzoffset_vals.pop() - tz_out = timezone(timedelta(seconds=tz_offset)) - elif not utc_convert: - if tz_out and (state.found_other or state.found_naive_str): - # found_other indicates a tz-naive int, float, dt64, or date - # e.g. test_to_datetime_mixed_awareness_mixed_types - raise ValueError("Cannot mix tz-aware with tz-naive values") + tz_out = state.check_for_mixed_inputs(tz_out, utc) if infer_reso: if state.creso_ever_changed: diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 88a9a259ac8ec..31979b293a940 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -36,7 +36,7 @@ "is_supported_dtype", ] -from pandas._libs.tslibs import dtypes # pylint: disable=import-self +from pandas._libs.tslibs import dtypes from pandas._libs.tslibs.conversion import localize_pydatetime from pandas._libs.tslibs.dtypes import ( Resolution, diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index aa01a05d0d932..61095b3f034fd 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -18,6 +18,7 @@ from cpython.object cimport ( Py_LT, Py_NE, ) +from cpython.unicode cimport PyUnicode_AsUTF8AndSize from libc.stdint cimport INT64_MAX import_datetime() @@ -44,7 +45,6 @@ from pandas._libs.tslibs.dtypes cimport ( npy_unit_to_abbrev, npy_unit_to_attrname, ) -from pandas._libs.tslibs.util cimport get_c_string_buf_and_size cdef extern from "pandas/datetime/pd_datetime.h": @@ -341,13 +341,13 @@ cdef int string_to_dts( const char* format_buf FormatRequirement format_requirement - buf = get_c_string_buf_and_size(val, &length) + buf = PyUnicode_AsUTF8AndSize(val, &length) if format is None: format_buf = b"" format_length = 0 format_requirement = INFER_FORMAT else: - format_buf = get_c_string_buf_and_size(format, &format_length) + format_buf = PyUnicode_AsUTF8AndSize(format, &format_length) format_requirement = exact return parse_iso_8601_datetime(buf, length, want_exc, dts, out_bestunit, out_local, out_tzoffset, diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index e36abdf0ad971..107608ec9f606 100644 --- 
a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -219,8 +219,7 @@ cdef _get_calendar(weekmask, holidays, calendar): holidays = holidays + calendar.holidays().tolist() except AttributeError: pass - holidays = [_to_dt64D(dt) for dt in holidays] - holidays = tuple(sorted(holidays)) + holidays = tuple(sorted(_to_dt64D(dt) for dt in holidays)) kwargs = {"weekmask": weekmask} if holidays: @@ -419,11 +418,10 @@ cdef class BaseOffset: if "holidays" in all_paras and not all_paras["holidays"]: all_paras.pop("holidays") - exclude = ["kwds", "name", "calendar"] - attrs = [(k, v) for k, v in all_paras.items() - if (k not in exclude) and (k[0] != "_")] - attrs = sorted(set(attrs)) - params = tuple([str(type(self))] + attrs) + exclude = {"kwds", "name", "calendar"} + attrs = {(k, v) for k, v in all_paras.items() + if (k not in exclude) and (k[0] != "_")} + params = tuple([str(type(self))] + sorted(attrs)) return params @property diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 384df1cac95eb..85ef3fd93ff09 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -19,6 +19,7 @@ from cpython.datetime cimport ( from datetime import timezone from cpython.object cimport PyObject_Str +from cpython.unicode cimport PyUnicode_AsUTF8AndSize from cython cimport Py_ssize_t from libc.string cimport strchr @@ -74,10 +75,7 @@ import_pandas_datetime() from pandas._libs.tslibs.strptime import array_strptime -from pandas._libs.tslibs.util cimport ( - get_c_string_buf_and_size, - is_array, -) +from pandas._libs.tslibs.util cimport is_array cdef extern from "pandas/portable.h": @@ -175,7 +173,7 @@ cdef datetime _parse_delimited_date( int day = 1, month = 1, year bint can_swap = 0 - buf = get_c_string_buf_and_size(date_string, &length) + buf = PyUnicode_AsUTF8AndSize(date_string, &length) if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]): # parsing MM?DD?YYYY and DD?MM?YYYY dates month = _parse_2digit(buf) @@ -251,7 +249,7 @@ cdef bint _does_string_look_like_time(str parse_string): Py_ssize_t length int hour = -1, minute = -1 - buf = get_c_string_buf_and_size(parse_string, &length) + buf = PyUnicode_AsUTF8AndSize(parse_string, &length) if length >= 4: if buf[1] == b":": # h:MM format @@ -467,7 +465,7 @@ cpdef bint _does_string_look_like_datetime(str py_string): char first int error = 0 - buf = get_c_string_buf_and_size(py_string, &length) + buf = PyUnicode_AsUTF8AndSize(py_string, &length) if length >= 1: first = buf[0] if first == b"0": @@ -521,7 +519,7 @@ cdef datetime _parse_dateabbr_string(str date_string, datetime default, pass if 4 <= date_len <= 7: - buf = get_c_string_buf_and_size(date_string, &date_len) + buf = PyUnicode_AsUTF8AndSize(date_string, &date_len) try: i = date_string.index("Q", 1, 6) if i == 1: diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd index dd8936f080b31..d2eae910a87b5 100644 --- a/pandas/_libs/tslibs/strptime.pxd +++ b/pandas/_libs/tslibs/strptime.pxd @@ -18,9 +18,12 @@ cdef class DatetimeParseState: bint found_tz bint found_naive bint found_naive_str + bint found_aware_str bint found_other bint creso_ever_changed NPY_DATETIMEUNIT creso + set out_tzoffset_vals cdef tzinfo process_datetime(self, datetime dt, tzinfo tz, bint utc_convert) cdef bint update_creso(self, NPY_DATETIMEUNIT item_reso) noexcept + cdef tzinfo check_for_mixed_inputs(self, tzinfo tz_out, bint utc) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 
5c9f1c770ea7f..d6c3285d89c59 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -252,8 +252,11 @@ cdef class DatetimeParseState: # found_naive_str refers to a string that was parsed to a timezone-naive # datetime. self.found_naive_str = False + self.found_aware_str = False self.found_other = False + self.out_tzoffset_vals = set() + self.creso = creso self.creso_ever_changed = False @@ -292,6 +295,58 @@ cdef class DatetimeParseState: "tz-naive values") return tz + cdef tzinfo check_for_mixed_inputs( + self, + tzinfo tz_out, + bint utc, + ): + cdef: + bint is_same_offsets + float tz_offset + + if self.found_aware_str and not utc: + # GH#17697, GH#57275 + # 1) If all the offsets are equal, return one offset for + # the parsed dates to (maybe) pass to DatetimeIndex + # 2) If the offsets are different, then do not force the parsing + # and raise a ValueError: "cannot parse datetimes with + # mixed time zones unless `utc=True`" instead + is_same_offsets = len(self.out_tzoffset_vals) == 1 + if not is_same_offsets or (self.found_naive or self.found_other): + # e.g. test_to_datetime_mixed_awareness_mixed_types (array_to_datetime) + raise ValueError( + "Mixed timezones detected. Pass utc=True in to_datetime " + "or tz='UTC' in DatetimeIndex to convert to a common timezone." + ) + elif tz_out is not None: + # GH#55693 + tz_offset = self.out_tzoffset_vals.pop() + tz_out2 = timezone(timedelta(seconds=tz_offset)) + if not tz_compare(tz_out, tz_out2): + # e.g. (array_strptime) + # test_to_datetime_mixed_offsets_with_utc_false_removed + # e.g. test_to_datetime_mixed_tzs_mixed_types (array_to_datetime) + raise ValueError( + "Mixed timezones detected. Pass utc=True in to_datetime " + "or tz='UTC' in DatetimeIndex to convert to a common timezone." + ) + # e.g. (array_strptime) + # test_guess_datetime_format_with_parseable_formats + # e.g. test_to_datetime_mixed_types_matching_tzs (array_to_datetime) + else: + # e.g. test_to_datetime_iso8601_with_timezone_valid (array_strptime) + tz_offset = self.out_tzoffset_vals.pop() + tz_out = timezone(timedelta(seconds=tz_offset)) + elif not utc: + if tz_out and (self.found_other or self.found_naive_str): + # found_other indicates a tz-naive int, float, dt64, or date + # e.g. test_to_datetime_mixed_awareness_mixed_types (array_to_datetime) + raise ValueError( + "Mixed timezones detected. Pass utc=True in to_datetime " + "or tz='UTC' in DatetimeIndex to convert to a common timezone." 
+ ) + return tz_out + def array_strptime( ndarray[object] values, @@ -319,11 +374,8 @@ def array_strptime( npy_datetimestruct dts int64_t[::1] iresult object val - bint seen_datetime_offset = False bint is_raise = errors=="raise" bint is_coerce = errors=="coerce" - bint is_same_offsets - set out_tzoffset_vals = set() tzinfo tz, tz_out = None bint iso_format = format_is_iso(fmt) NPY_DATETIMEUNIT out_bestunit, item_reso @@ -418,15 +470,15 @@ def array_strptime( ) from err if out_local == 1: nsecs = out_tzoffset * 60 - out_tzoffset_vals.add(nsecs) - seen_datetime_offset = True + state.out_tzoffset_vals.add(nsecs) + state.found_aware_str = True tz = timezone(timedelta(minutes=out_tzoffset)) value = tz_localize_to_utc_single( value, tz, ambiguous="raise", nonexistent=None, creso=creso ) else: tz = None - out_tzoffset_vals.add("naive") + state.out_tzoffset_vals.add("naive") state.found_naive_str = True iresult[i] = value continue @@ -475,12 +527,12 @@ def array_strptime( elif creso == NPY_DATETIMEUNIT.NPY_FR_ms: nsecs = nsecs // 10**3 - out_tzoffset_vals.add(nsecs) - seen_datetime_offset = True + state.out_tzoffset_vals.add(nsecs) + state.found_aware_str = True else: state.found_naive_str = True tz = None - out_tzoffset_vals.add("naive") + state.out_tzoffset_vals.add("naive") except ValueError as ex: ex.args = ( @@ -499,35 +551,7 @@ def array_strptime( raise return values, None - if seen_datetime_offset and not utc: - is_same_offsets = len(out_tzoffset_vals) == 1 - if not is_same_offsets or (state.found_naive or state.found_other): - raise ValueError( - "Mixed timezones detected. Pass utc=True in to_datetime " - "or tz='UTC' in DatetimeIndex to convert to a common timezone." - ) - elif tz_out is not None: - # GH#55693 - tz_offset = out_tzoffset_vals.pop() - tz_out2 = timezone(timedelta(seconds=tz_offset)) - if not tz_compare(tz_out, tz_out2): - # e.g. test_to_datetime_mixed_offsets_with_utc_false_removed - raise ValueError( - "Mixed timezones detected. Pass utc=True in to_datetime " - "or tz='UTC' in DatetimeIndex to convert to a common timezone." - ) - # e.g. test_guess_datetime_format_with_parseable_formats - else: - # e.g. test_to_datetime_iso8601_with_timezone_valid - tz_offset = out_tzoffset_vals.pop() - tz_out = timezone(timedelta(seconds=tz_offset)) - elif not utc: - if tz_out and (state.found_other or state.found_naive_str): - # found_other indicates a tz-naive int, float, dt64, or date - raise ValueError( - "Mixed timezones detected. Pass utc=True in to_datetime " - "or tz='UTC' in DatetimeIndex to convert to a common timezone." - ) + tz_out = state.check_for_mixed_inputs(tz_out, utc) if infer_reso: if state.creso_ever_changed: diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index d4cd90613ca5b..82daa6d942095 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1751,7 +1751,7 @@ class Timestamp(_Timestamp): tzinfo_type tzinfo=None, *, nanosecond=None, - tz=None, + tz=_no_input, unit=None, fold=None, ): @@ -1783,6 +1783,10 @@ class Timestamp(_Timestamp): _date_attributes = [year, month, day, hour, minute, second, microsecond, nanosecond] + explicit_tz_none = tz is None + if tz is _no_input: + tz = None + if tzinfo is not None: # GH#17690 tzinfo must be a datetime.tzinfo object, ensured # by the cython annotation. 
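For context on the check_for_mixed_inputs consolidation above: the user-facing behaviour of to_datetime is meant to stay the same, strings carrying different fixed offsets still raise unless utc=True is passed. A minimal sketch with illustrative values:

>>> import pandas as pd
>>> pd.to_datetime(["2020-01-01 00:00+01:00", "2020-01-01 00:00+02:00"])
Traceback (most recent call last):
...
ValueError: Mixed timezones detected. Pass utc=True in to_datetime or tz='UTC' in DatetimeIndex to convert to a common timezone.
>>> pd.to_datetime(["2020-01-01 00:00+01:00", "2020-01-01 00:00+02:00"], utc=True)
DatetimeIndex(['2019-12-31 23:00:00+00:00', '2019-12-31 22:00:00+00:00'],
              dtype='datetime64[ns, UTC]', freq=None)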
@@ -1883,6 +1887,11 @@ class Timestamp(_Timestamp): if ts.value == NPY_NAT: return NaT + if ts.tzinfo is not None and explicit_tz_none: + raise ValueError( + "Passed data is timezone-aware, incompatible with 'tz=None'." + ) + return create_timestamp_from_ts(ts.value, ts.dts, ts.tzinfo, ts.fold, ts.creso) def _round(self, freq, mode, ambiguous="raise", nonexistent="raise"): diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index a5822e57d3fa6..f144275e0ee6a 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -1,6 +1,5 @@ from cpython.object cimport PyTypeObject -from cpython.unicode cimport PyUnicode_AsUTF8AndSize cdef extern from "Python.h": @@ -155,36 +154,6 @@ cdef inline bint is_nan(object val): return is_complex_object(val) and val != val -cdef inline const char* get_c_string_buf_and_size(str py_string, - Py_ssize_t *length) except NULL: - """ - Extract internal char* buffer of unicode or bytes object `py_string` with - getting length of this internal buffer saved in `length`. - - Notes - ----- - Python object owns memory, thus returned char* must not be freed. - `length` can be NULL if getting buffer length is not needed. - - Parameters - ---------- - py_string : str - length : Py_ssize_t* - - Returns - ------- - buf : const char* - """ - # Note PyUnicode_AsUTF8AndSize() can - # potentially allocate memory inside in unlikely case of when underlying - # unicode object was stored as non-utf8 and utf8 wasn't requested before. - return PyUnicode_AsUTF8AndSize(py_string, length) - - -cdef inline const char* get_c_string(str py_string) except NULL: - return get_c_string_buf_and_size(py_string, NULL) - - cdef inline bytes string_encode_locale(str py_string): """As opposed to PyUnicode_Encode, use current system locale to encode.""" return PyUnicode_EncodeLocale(py_string, NULL) diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index d9516077788c8..cd2e2b4141ffd 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -11,6 +11,7 @@ from typing import ( TYPE_CHECKING, Literal, + Union, cast, ) import warnings @@ -32,7 +33,8 @@ def assert_produces_warning( ] = "always", check_stacklevel: bool = True, raise_on_extra_warnings: bool = True, - match: str | None = None, + match: str | tuple[str | None, ...] | None = None, + must_find_all_warnings: bool = True, ) -> Generator[list[warnings.WarningMessage], None, None]: """ Context manager for running code expected to either raise a specific warning, @@ -68,8 +70,15 @@ class for all warnings. To raise multiple types of exceptions, raise_on_extra_warnings : bool, default True Whether extra warnings not of the type `expected_warning` should cause the test to fail. - match : str, optional - Match warning message. + match : {str, tuple[str, ...]}, optional + Match warning message. If it's a tuple, it has to be the size of + `expected_warning`. If additionally `must_find_all_warnings` is + True, each expected warning's message gets matched with a respective + match. Otherwise, multiple values get treated as an alternative. + must_find_all_warnings : bool, default True + If True and `expected_warning` is a tuple, each expected warning + type must get encountered. Otherwise, even one expected warning + results in success. Examples -------- @@ -97,13 +106,35 @@ class for all warnings. 
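Stepping back to the Timestamp change above: with tz defaulting to a _no_input sentinel, an explicit tz=None combined with timezone-aware input is now rejected, while omitting tz keeps the parsed offset. A short sketch:

>>> import pandas as pd
>>> pd.Timestamp("2020-01-01 00:00+01:00", tz=None)
Traceback (most recent call last):
...
ValueError: Passed data is timezone-aware, incompatible with 'tz=None'.
>>> pd.Timestamp("2020-01-01 00:00+01:00").tz is not None
True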
To raise multiple types of exceptions, yield w finally: if expected_warning: - expected_warning = cast(type[Warning], expected_warning) - _assert_caught_expected_warning( - caught_warnings=w, - expected_warning=expected_warning, - match=match, - check_stacklevel=check_stacklevel, - ) + if isinstance(expected_warning, tuple) and must_find_all_warnings: + match = ( + match + if isinstance(match, tuple) + else (match,) * len(expected_warning) + ) + for warning_type, warning_match in zip(expected_warning, match): + _assert_caught_expected_warnings( + caught_warnings=w, + expected_warning=warning_type, + match=warning_match, + check_stacklevel=check_stacklevel, + ) + else: + expected_warning = cast( + Union[type[Warning], tuple[type[Warning], ...]], + expected_warning, + ) + match = ( + "|".join(m for m in match if m) + if isinstance(match, tuple) + else match + ) + _assert_caught_expected_warnings( + caught_warnings=w, + expected_warning=expected_warning, + match=match, + check_stacklevel=check_stacklevel, + ) if raise_on_extra_warnings: _assert_caught_no_extra_warnings( caught_warnings=w, @@ -123,10 +154,10 @@ def maybe_produces_warning( return nullcontext() -def _assert_caught_expected_warning( +def _assert_caught_expected_warnings( *, caught_warnings: Sequence[warnings.WarningMessage], - expected_warning: type[Warning], + expected_warning: type[Warning] | tuple[type[Warning], ...], match: str | None, check_stacklevel: bool, ) -> None: @@ -134,6 +165,11 @@ def _assert_caught_expected_warning( saw_warning = False matched_message = False unmatched_messages = [] + warning_name = ( + tuple(x.__name__ for x in expected_warning) + if isinstance(expected_warning, tuple) + else expected_warning.__name__ + ) for actual_warning in caught_warnings: if issubclass(actual_warning.category, expected_warning): @@ -149,13 +185,11 @@ def _assert_caught_expected_warning( unmatched_messages.append(actual_warning.message) if not saw_warning: - raise AssertionError( - f"Did not see expected warning of class {expected_warning.__name__!r}" - ) + raise AssertionError(f"Did not see expected warning of class {warning_name!r}") if match and not matched_message: raise AssertionError( - f"Did not see warning {expected_warning.__name__!r} " + f"Did not see warning {warning_name!r} " f"matching '{match}'. 
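With the tuple-aware matching above, a test can now assert on several warning types and pair each with its own pattern (must_find_all_warnings=True is the default). A rough sketch, assuming pandas._testing is importable in the environment:

>>> import warnings
>>> import pandas._testing as tm
>>> with tm.assert_produces_warning(
...     (DeprecationWarning, FutureWarning),
...     match=("deprecated", "will be removed"),
... ):
...     warnings.warn("this method is deprecated", DeprecationWarning)
...     warnings.warn("this option will be removed in 3.0", FutureWarning)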
The emitted warning messages are " f"{unmatched_messages}" ) diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py index b986e03e25815..7ebed8857f0af 100644 --- a/pandas/_testing/contexts.py +++ b/pandas/_testing/contexts.py @@ -173,7 +173,7 @@ def raises_chained_assignment_error(warn=True, extra_warnings=(), extra_match=() elif PYPY and extra_warnings: return assert_produces_warning( extra_warnings, - match="|".join(extra_match), + match=extra_match, ) else: if using_copy_on_write(): @@ -190,5 +190,5 @@ def raises_chained_assignment_error(warn=True, extra_warnings=(), extra_match=() warning = (warning, *extra_warnings) # type: ignore[assignment] return assert_produces_warning( warning, - match="|".join((match, *extra_match)), + match=(match, *extra_match), ) diff --git a/pandas/api/internals.py b/pandas/api/internals.py new file mode 100644 index 0000000000000..03d8992a87575 --- /dev/null +++ b/pandas/api/internals.py @@ -0,0 +1,62 @@ +import numpy as np + +from pandas._typing import ArrayLike + +from pandas import ( + DataFrame, + Index, +) +from pandas.core.internals.api import _make_block +from pandas.core.internals.managers import BlockManager as _BlockManager + + +def create_dataframe_from_blocks( + blocks: list[tuple[ArrayLike, np.ndarray]], index: Index, columns: Index +) -> DataFrame: + """ + Low-level function to create a DataFrame from arrays as they are + representing the block structure of the resulting DataFrame. + + Attention: this is an advanced, low-level function that should only be + used if you know that the below-mentioned assumptions are guaranteed. + If passing data that do not follow those assumptions, subsequent + subsequent operations on the resulting DataFrame might lead to strange + errors. + For almost all use cases, you should use the standard pd.DataFrame(..) + constructor instead. If you are planning to use this function, let us + know by opening an issue at https://github.com/pandas-dev/pandas/issues. + + Assumptions: + + - The block arrays are either a 2D numpy array or a pandas ExtensionArray + - In case of a numpy array, it is assumed to already be in the expected + shape for Blocks (2D, (cols, rows), i.e. transposed compared to the + DataFrame columns). + - All arrays are taken as is (no type inference) and expected to have the + correct size. + - The placement arrays have the correct length (equalling the number of + columns that its equivalent block array represents), and all placement + arrays together form a complete set of 0 to n_columns - 1. + + Parameters + ---------- + blocks : list of tuples of (block_array, block_placement) + This should be a list of tuples existing of (block_array, block_placement), + where: + + - block_array is a 2D numpy array or a 1D ExtensionArray, following the + requirements listed above. + - block_placement is a 1D integer numpy array + index : Index + The Index object for the `index` of the resulting DataFrame. + columns : Index + The Index object for the `columns` of the resulting DataFrame. 
+ + Returns + ------- + DataFrame + """ + block_objs = [_make_block(*block) for block in blocks] + axes = [columns, index] + mgr = _BlockManager(block_objs, axes) + return DataFrame._from_mgr(mgr, mgr.axes) diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py index 9b5d2cb06b523..df6392bf692a2 100644 --- a/pandas/api/typing/__init__.py +++ b/pandas/api/typing/__init__.py @@ -9,6 +9,7 @@ DataFrameGroupBy, SeriesGroupBy, ) +from pandas.core.indexes.frozen import FrozenList from pandas.core.resample import ( DatetimeIndexResamplerGroupby, PeriodIndexResamplerGroupby, @@ -38,6 +39,7 @@ "ExpandingGroupby", "ExponentialMovingWindow", "ExponentialMovingWindowGroupby", + "FrozenList", "JsonReader", "NaTType", "NAType", diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index bcf295fd6b490..b5c1c98da1c78 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -35,20 +35,3 @@ "StringArray", "TimedeltaArray", ] - - -def __getattr__(name: str) -> type[NumpyExtensionArray]: - if name == "PandasArray": - # GH#53694 - import warnings - - from pandas.util._exceptions import find_stack_level - - warnings.warn( - "PandasArray has been renamed NumpyExtensionArray. Use that " - "instead. This alias will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return NumpyExtensionArray - raise AttributeError(f"module 'pandas.arrays' has no attribute '{name}'") diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 99b5053ce250c..aa2bf2f527bd8 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -231,7 +231,7 @@ def __get__(self, obj, cls): return accessor_obj -@doc(klass="", others="") +@doc(klass="", examples="", others="") def _register_accessor(name: str, cls): """ Register a custom accessor on {klass} objects. @@ -255,51 +255,26 @@ def _register_accessor(name: str, cls): Notes ----- - When accessed, your accessor will be initialized with the pandas object - the user is interacting with. So the signature must be + This function allows you to register a custom-defined accessor class for {klass}. + The requirements for the accessor class are as follows: - .. code-block:: python + * Must contain an init method that: - def __init__(self, pandas_object): # noqa: E999 - ... + * accepts a single {klass} object - For consistency with pandas methods, you should raise an ``AttributeError`` - if the data passed to your accessor has an incorrect dtype. + * raises an AttributeError if the {klass} object does not have correctly + matching inputs for the accessor - >>> pd.Series(["a", "b"]).dt - Traceback (most recent call last): - ... - AttributeError: Can only use .dt accessor with datetimelike values + * Must contain a method for each access pattern. - Examples - -------- - In your library code:: - - @pd.api.extensions.register_dataframe_accessor("geo") - class GeoAccessor: - def __init__(self, pandas_obj): - self._obj = pandas_obj - - @property - def center(self): - # return the geographic center point of this DataFrame - lat = self._obj.latitude - lon = self._obj.longitude - return (float(lon.mean()), float(lat.mean())) + * The methods should be able to take any argument signature. - def plot(self): - # plot this array's data on a map, e.g., using Cartopy - pass + * Accessible using the @property decorator if no additional arguments are + needed. - Back in an interactive IPython session: - - .. 
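Picking up create_dataframe_from_blocks above: a minimal sketch of how the helper is meant to be called, with made-up data satisfying the documented assumptions (a 2D block array shaped (n_columns, n_rows) and an integer placement array covering every column):

>>> import numpy as np
>>> import pandas as pd
>>> from pandas.api.internals import create_dataframe_from_blocks
>>> block = np.array([[1, 2, 3], [4, 5, 6]])  # 2 columns x 3 rows, already transposed
>>> placement = np.array([0, 1])  # block rows map to DataFrame columns 0 and 1
>>> create_dataframe_from_blocks(
...     [(block, placement)], index=pd.RangeIndex(3), columns=pd.Index(["a", "b"])
... )
   a  b
0  1  4
1  2  5
2  3  6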
code-block:: ipython - - In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10), - ...: "latitude": np.linspace(0, 20)}}) - In [2]: ds.geo.center - Out[2]: (5.0, 10.0) - In [3]: ds.geo.plot() # plots data on a map + Examples + -------- + {examples} """ def decorator(accessor): @@ -318,21 +293,98 @@ def decorator(accessor): return decorator -@doc(_register_accessor, klass="DataFrame") +_register_df_examples = """ +An accessor that only accepts integers could +have a class defined like this: + +>>> @pd.api.extensions.register_dataframe_accessor("int_accessor") +... class IntAccessor: +... def __init__(self, pandas_obj): +... if not all(pandas_obj[col].dtype == 'int64' for col in pandas_obj.columns): +... raise AttributeError("All columns must contain integer values only") +... self._obj = pandas_obj +... +... def sum(self): +... return self._obj.sum() +... +>>> df = pd.DataFrame([[1, 2], ['x', 'y']]) +>>> df.int_accessor +Traceback (most recent call last): +... +AttributeError: All columns must contain integer values only. +>>> df = pd.DataFrame([[1, 2], [3, 4]]) +>>> df.int_accessor.sum() +0 4 +1 6 +dtype: int64""" + + +@doc(_register_accessor, klass="DataFrame", examples=_register_df_examples) def register_dataframe_accessor(name: str): from pandas import DataFrame return _register_accessor(name, DataFrame) -@doc(_register_accessor, klass="Series") +_register_series_examples = """ +An accessor that only accepts integers could +have a class defined like this: + +>>> @pd.api.extensions.register_series_accessor("int_accessor") +... class IntAccessor: +... def __init__(self, pandas_obj): +... if not pandas_obj.dtype == 'int64': +... raise AttributeError("The series must contain integer data only") +... self._obj = pandas_obj +... +... def sum(self): +... return self._obj.sum() +... +>>> df = pd.Series([1, 2, 'x']) +>>> df.int_accessor +Traceback (most recent call last): +... +AttributeError: The series must contain integer data only. +>>> df = pd.Series([1, 2, 3]) +>>> df.int_accessor.sum() +6""" + + +@doc(_register_accessor, klass="Series", examples=_register_series_examples) def register_series_accessor(name: str): from pandas import Series return _register_accessor(name, Series) -@doc(_register_accessor, klass="Index") +_register_index_examples = """ +An accessor that only accepts integers could +have a class defined like this: + +>>> @pd.api.extensions.register_index_accessor("int_accessor") +... class IntAccessor: +... def __init__(self, pandas_obj): +... if not all(isinstance(x, int) for x in pandas_obj): +... raise AttributeError("The index must only be an integer value") +... self._obj = pandas_obj +... +... def even(self): +... return [x for x in self._obj if x % 2 == 0] +>>> df = pd.DataFrame.from_dict( +... {"row1": {"1": 1, "2": "a"}, "row2": {"1": 2, "2": "b"}}, orient="index" +... ) +>>> df.index.int_accessor +Traceback (most recent call last): +... +AttributeError: The index must only be an integer value. +>>> df = pd.DataFrame( +... {"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]}, index=[1, 2, 5, 8] +... 
) +>>> df.index.int_accessor.even() +[2, 8]""" + + +@doc(_register_accessor, klass="Index", examples=_register_index_examples) def register_index_accessor(name: str): from pandas import Index diff --git a/pandas/core/api.py b/pandas/core/api.py index 3d2e855831c05..c8a4e9d8a23b2 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -41,7 +41,7 @@ UInt64Dtype, ) from pandas.core.arrays.string_ import StringDtype -from pandas.core.construction import array +from pandas.core.construction import array # noqa: ICN001 from pandas.core.flags import Flags from pandas.core.groupby import ( Grouper, diff --git a/pandas/core/apply.py b/pandas/core/apply.py index e8df24850f7a8..832beeddcef3c 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1710,9 +1710,9 @@ def normalize_keyword_aggregation( # TODO: aggspec type: typing.Dict[str, List[AggScalar]] aggspec = defaultdict(list) order = [] - columns, pairs = list(zip(*kwargs.items())) + columns = tuple(kwargs.keys()) - for column, aggfunc in pairs: + for column, aggfunc in kwargs.values(): aggspec[column].append(aggfunc) order.append((column, com.get_callable_name(aggfunc) or aggfunc)) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 7f4e6f6666382..cbd0221cc2082 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -210,7 +210,7 @@ def argmin(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[overri # override base class by adding axis keyword validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: - raise NotImplementedError + raise ValueError("Encountered an NA value with skipna=False") return nargminmax(self, "argmin", axis=axis) # Signature of "argmax" incompatible with supertype "ExtensionArray" @@ -218,7 +218,7 @@ def argmax(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[overri # override base class by adding axis keyword validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: - raise NotImplementedError + raise ValueError("Encountered an NA value with skipna=False") return nargminmax(self, "argmax", axis=axis) def unique(self) -> Self: @@ -296,13 +296,6 @@ def __getitem__( result = self._from_backing_data(result) return result - def _fill_mask_inplace( - self, method: str, limit: int | None, mask: npt.NDArray[np.bool_] - ) -> None: - # (for now) when self.ndim == 2, we assume axis=0 - func = missing.get_fill_func(method, ndim=self.ndim) - func(self._ndarray.T, limit=limit, mask=mask.T) - def _pad_or_backfill( self, *, @@ -335,7 +328,7 @@ def _pad_or_backfill( return new_values @doc(ExtensionArray.fillna) - def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self: + def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: mask = self.isna() # error: Argument 2 to "check_value_size" has incompatible type # "ExtensionArray"; expected "ndarray" @@ -354,8 +347,7 @@ def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Sel new_values[mask] = value else: # We validate the fill_value even if there is nothing to fill - if value is not None: - self._validate_setitem_value(value) + self._validate_setitem_value(value) if not copy: new_values = self[:] diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 84b62563605ac..1154130b9bed3 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1077,7 +1077,7 @@ def _pad_or_backfill( @doc(ExtensionArray.fillna) def fillna( self, - value: object | 
ArrayLike | None = None, + value: object | ArrayLike, limit: int | None = None, copy: bool = True, ) -> Self: @@ -1697,7 +1697,7 @@ def pyarrow_meth(data, skip_nulls, **kwargs): except (AttributeError, NotImplementedError, TypeError) as err: msg = ( f"'{type(self).__name__}' with dtype {self.dtype} " - f"does not support reduction '{name}' with pyarrow " + f"does not support operation '{name}' with pyarrow " f"version {pa.__version__}. '{name}' may be supported by " f"upgrading pyarrow." ) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 76615704f2e33..8a2856d0a7e64 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -885,7 +885,7 @@ def argmin(self, skipna: bool = True) -> int: # 2. argmin itself : total control over sorting. validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: - raise NotImplementedError + raise ValueError("Encountered an NA value with skipna=False") return nargminmax(self, "argmin") def argmax(self, skipna: bool = True) -> int: @@ -919,7 +919,7 @@ def argmax(self, skipna: bool = True) -> int: # 2. argmax itself : total control over sorting. validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: - raise NotImplementedError + raise ValueError("Encountered an NA value with skipna=False") return nargminmax(self, "argmax") def interpolate( @@ -1886,7 +1886,7 @@ def _reduce( Raises ------ - TypeError : subclass does not define reductions + TypeError : subclass does not define operations Examples -------- @@ -1897,7 +1897,7 @@ def _reduce( if meth is None: raise TypeError( f"'{type(self).__name__}' with dtype {self.dtype} " - f"does not support reduction '{name}'" + f"does not support operation '{name}'" ) result = meth(skipna=skipna, **kwargs) if keepdims: @@ -2111,25 +2111,6 @@ def _where(self, mask: npt.NDArray[np.bool_], value) -> Self: result[~mask] = val return result - # TODO(3.0): this can be removed once GH#33302 deprecation is enforced - def _fill_mask_inplace( - self, method: str, limit: int | None, mask: npt.NDArray[np.bool_] - ) -> None: - """ - Replace values in locations specified by 'mask' using pad or backfill. - - See also - -------- - ExtensionArray.fillna - """ - func = missing.get_fill_func(method) - npvalues = self.astype(object) - # NB: if we don't copy mask here, it may be altered inplace, which - # would mess up the `self[mask] = ...` below. - func(npvalues, limit=limit, mask=mask.copy()) - new_values = self._from_sequence(npvalues, dtype=self.dtype) - self[mask] = new_values[mask] - def _rank( self, *, diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 416331a260e9f..8d6880fc2acb3 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -276,9 +276,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi provided). dtype : CategoricalDtype An instance of ``CategoricalDtype`` to use for this categorical. - fastpath : bool - The 'fastpath' keyword in Categorical is deprecated and will be - removed in a future version. Use Categorical.from_codes instead. copy : bool, default True Whether to copy if the codes are unchanged. @@ -391,20 +388,8 @@ def __init__( categories=None, ordered=None, dtype: Dtype | None = None, - fastpath: bool | lib.NoDefault = lib.no_default, copy: bool = True, ) -> None: - if fastpath is not lib.no_default: - # GH#20110 - warnings.warn( - "The 'fastpath' keyword in Categorical is deprecated and will " - "be removed in a future version. 
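The NotImplementedError-to-ValueError switch for argmin/argmax (here and in the numpy-backed and sparse counterparts elsewhere in this diff) surfaces to users roughly like this, shown with an illustrative nullable array:

>>> import pandas as pd
>>> arr = pd.array([1, pd.NA, 3], dtype="Int64")
>>> arr.argmax()
2
>>> arr.argmax(skipna=False)
Traceback (most recent call last):
...
ValueError: Encountered an NA value with skipna=False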
Use Categorical.from_codes instead", - DeprecationWarning, - stacklevel=find_stack_level(), - ) - else: - fastpath = False - dtype = CategoricalDtype._from_values_or_dtype( values, categories, ordered, dtype ) @@ -412,12 +397,6 @@ def __init__( # we may have dtype.categories be None, and we need to # infer categories in a factorization step further below - if fastpath: - codes = coerce_indexer_dtype(values, dtype.categories) - dtype = CategoricalDtype(ordered=False).update_dtype(dtype) - super().__init__(codes, dtype) - return - if not is_list_like(values): # GH#38433 raise TypeError("Categorical input must be list-like") @@ -2503,7 +2482,6 @@ def unique(self) -> Self: ['b', 'a'] Categories (3, object): ['a' < 'b' < 'c'] """ - # pylint: disable=useless-parent-delegation return super().unique() def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 52cb175ca79a2..8ada9d88e08bc 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -506,7 +506,6 @@ def view(self, dtype: Literal["m8[ns]"]) -> TimedeltaArray: ... @overload def view(self, dtype: Dtype | None = ...) -> ArrayLike: ... - # pylint: disable-next=useless-parent-delegation def view(self, dtype: Dtype | None = None) -> ArrayLike: # we need to explicitly call super() method as long as the `@overload`s # are present in this file. @@ -1661,8 +1660,14 @@ def _groupby_op( dtype = self.dtype if dtype.kind == "M": # Adding/multiplying datetimes is not valid - if how in ["any", "all", "sum", "prod", "cumsum", "cumprod", "var", "skew"]: - raise TypeError(f"datetime64 type does not support operation: '{how}'") + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: + raise TypeError(f"datetime64 type does not support operation '{how}'") + if how in ["any", "all"]: + # GH#34479 + raise TypeError( + f"'{how}' with datetime64 dtypes is no longer supported. " + f"Use (obj != pd.Timestamp(0)).{how}() instead." + ) elif isinstance(dtype, PeriodDtype): # Adding/multiplying Periods is not valid @@ -1670,11 +1675,9 @@ def _groupby_op( raise TypeError(f"Period type does not support {how} operations") if how in ["any", "all"]: # GH#34479 - warnings.warn( - f"'{how}' with PeriodDtype is deprecated and will raise in a " - f"future version. Use (obj != pd.Period(0, freq)).{how}() instead.", - FutureWarning, - stacklevel=find_stack_level(), + raise TypeError( + f"'{how}' with PeriodDtype is no longer supported. " + f"Use (obj != pd.Period(0, freq)).{how}() instead." ) else: # timedeltas we can add but not multiply @@ -1784,7 +1787,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: ---------- freq : str or Offset The frequency level to {op} the index to. Must be a fixed - frequency like 'S' (second) not 'ME' (month end). See + frequency like 's' (second) not 'ME' (month end). See :ref:`frequency aliases ` for a list of possible `freq` values. ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' @@ -2424,17 +2427,17 @@ def validate_periods(periods: None) -> None: ... @overload -def validate_periods(periods: int | float) -> int: ... +def validate_periods(periods: int) -> int: ... -def validate_periods(periods: int | float | None) -> int | None: +def validate_periods(periods: int | None) -> int | None: """ If a `periods` argument is passed to the Datetime/Timedelta Array/Index constructor, cast it to an integer. 
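The datetime64 branch above now raises instead of warning, and PeriodDtype gets the same treatment. Roughly, for a grouped datetime column (illustrative frame):

>>> import pandas as pd
>>> df = pd.DataFrame({"key": ["a", "a"], "ts": pd.to_datetime(["2024-01-01", "2024-01-02"])})
>>> df.groupby("key")["ts"].any()
Traceback (most recent call last):
...
TypeError: 'any' with datetime64 dtypes is no longer supported. Use (obj != pd.Timestamp(0)).any() instead.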
Parameters ---------- - periods : None, float, int + periods : None, int Returns ------- @@ -2443,22 +2446,13 @@ def validate_periods(periods: int | float | None) -> int | None: Raises ------ TypeError - if periods is None, float, or int + if periods is not None or int """ - if periods is not None: - if lib.is_float(periods): - warnings.warn( - # GH#56036 - "Non-integer 'periods' in pd.date_range, pd.timedelta_range, " - "pd.period_range, and pd.interval_range are deprecated and " - "will raise in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - periods = int(periods) - elif not lib.is_integer(periods): - raise TypeError(f"periods must be a number, got {periods}") - return periods + if periods is not None and not lib.is_integer(periods): + raise TypeError(f"periods must be an integer, got {periods}") + # error: Incompatible return value type (got "int | integer[Any] | None", + # expected "int | None") + return periods # type: ignore[return-value] def _validate_inferred_freq( diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 74b8cfb65cbc7..653e63e9d1e2d 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -135,6 +135,12 @@ class FloatingArray(NumericArray): ------- None +See Also +-------- +CategoricalDtype : Type for categorical data with the categories and orderedness. +IntegerDtype : An ExtensionDtype to hold a single size & kind of integer dtype. +StringDtype : An ExtensionDtype for string data. + Examples -------- For Float32Dtype: diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index af666a591b1bc..86f58b48ea3be 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -892,7 +892,7 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr indexer = obj.argsort()[-1] return obj[indexer] - def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self: + def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: """ Fill NA/NaN values using the specified method. diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d20d7f98b8aa8..190888d281ea9 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -236,7 +236,7 @@ def _pad_or_backfill( return new_values @doc(ExtensionArray.fillna) - def fillna(self, value=None, limit: int | None = None, copy: bool = True) -> Self: + def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self: mask = self._mask value = missing.check_value_size(value, mask, len(self)) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index bdcb3219a9875..522d86fb165f6 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -40,7 +40,6 @@ from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.cast import ( - construct_1d_arraylike_from_scalar, find_common_type, maybe_box_datetimelike, ) @@ -399,19 +398,10 @@ def __init__( dtype = dtype.subtype if is_scalar(data): - warnings.warn( - f"Constructing {type(self).__name__} with scalar data is deprecated " - "and will raise in a future version. Pass a sequence instead.", - FutureWarning, - stacklevel=find_stack_level(), + raise TypeError( + f"Cannot construct {type(self).__name__} from scalar data. " + "Pass a sequence instead." 
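With validate_periods tightened above, the float fallback is gone, so the range helpers reject non-integer periods up front. A short sketch:

>>> import pandas as pd
>>> pd.date_range("2024-01-01", periods=3, freq="D")
DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03'], dtype='datetime64[ns]', freq='D')
>>> pd.date_range("2024-01-01", periods=3.0, freq="D")
Traceback (most recent call last):
...
TypeError: periods must be an integer, got 3.0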
) - if sparse_index is None: - npoints = 1 - else: - npoints = sparse_index.length - - data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None) - dtype = data.dtype if dtype is not None: dtype = pandas_dtype(dtype) @@ -716,7 +706,7 @@ def isna(self) -> Self: # type: ignore[override] def fillna( self, - value=None, + value, limit: int | None = None, copy: bool = True, ) -> Self: @@ -746,8 +736,6 @@ def fillna( When ``self.fill_value`` is not NA, the result dtype will be ``self.dtype``. Again, this preserves the amount of memory used. """ - if value is None: - raise ValueError("Must specify 'value'.") new_values = np.where(isna(self.sp_values), value, self.sp_values) if self._null_fill_value: @@ -1623,13 +1611,13 @@ def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int: def argmax(self, skipna: bool = True) -> int: validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: - raise NotImplementedError + raise ValueError("Encountered an NA value with skipna=False") return self._argmin_argmax("argmax") def argmin(self, skipna: bool = True) -> int: validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: - raise NotImplementedError + raise ValueError("Encountered an NA value with skipna=False") return self._argmin_argmax("argmin") # ------------------------------------------------------------------------ diff --git a/pandas/core/base.py b/pandas/core/base.py index 263265701691b..95b203590b393 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -127,7 +127,7 @@ def __sizeof__(self) -> int: """ memory_usage = getattr(self, "memory_usage", None) if memory_usage: - mem = memory_usage(deep=True) # pylint: disable=not-callable + mem = memory_usage(deep=True) return int(mem if is_scalar(mem) else mem.sum()) # no memory_usage attribute, so fall back to object's 'sizeof' @@ -735,13 +735,8 @@ def argmax( nv.validate_minmax_axis(axis) skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs) - if skipna and len(delegate) > 0 and isna(delegate).all(): - raise ValueError("Encountered all NA values") - elif not skipna and isna(delegate).any(): - raise ValueError("Encountered an NA value with skipna=False") - if isinstance(delegate, ExtensionArray): - return delegate.argmax() + return delegate.argmax(skipna=skipna) else: result = nanops.nanargmax(delegate, skipna=skipna) # error: Incompatible return value type (got "Union[int, ndarray]", expected @@ -754,15 +749,10 @@ def argmin( ) -> int: delegate = self._values nv.validate_minmax_axis(axis) - skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs) - - if skipna and len(delegate) > 0 and isna(delegate).all(): - raise ValueError("Encountered all NA values") - elif not skipna and isna(delegate).any(): - raise ValueError("Encountered an NA value with skipna=False") + skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs) if isinstance(delegate, ExtensionArray): - return delegate.argmin() + return delegate.argmin(skipna=skipna) else: result = nanops.nanargmin(delegate, skipna=skipna) # error: Incompatible return value type (got "Union[int, ndarray]", expected @@ -924,6 +914,7 @@ def value_counts( Returns ------- Series + Series containing counts of unique values. 
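The SparseArray scalar-construction enforcement above means the old scalar broadcast is simply rejected now; sequences work as before. Sketch:

>>> from pandas.arrays import SparseArray
>>> SparseArray(1.0)
Traceback (most recent call last):
...
TypeError: Cannot construct SparseArray from scalar data. Pass a sequence instead.
>>> SparseArray([1.0, 0.0, 1.0], fill_value=0.0).to_dense()
array([1., 0., 1.])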
See Also -------- diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index cd9aa1833d586..7d8e23abf43b6 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -115,7 +115,7 @@ def _resolve_name(self): res = self.env.resolve(local_name, is_local=is_local) self.update(res) - if hasattr(res, "ndim") and res.ndim > 2: + if hasattr(res, "ndim") and isinstance(res.ndim, int) and res.ndim > 2: raise NotImplementedError( "N-dimensional objects, where N > 2, are not supported with eval" ) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index aa621fea6c39a..4d8d3c2816f69 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -250,7 +250,7 @@ def is_scipy_sparse(arr) -> bool: """ global _is_scipy_sparse - if _is_scipy_sparse is None: # pylint: disable=used-before-assignment + if _is_scipy_sparse is None: try: from scipy.sparse import issparse as _is_scipy_sparse except ImportError: diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index f94d32a3b8547..98e689528744e 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -698,8 +698,8 @@ class DatetimeTZDtype(PandasExtensionDtype): Parameters ---------- unit : str, default "ns" - The precision of the datetime data. Currently limited - to ``"ns"``. + The precision of the datetime data. Valid options are + ``"s"``, ``"ms"``, ``"us"``, ``"ns"``. tz : str, int, or datetime.tzinfo The timezone. @@ -1762,24 +1762,18 @@ def _check_fill_value(self) -> None: val = self._fill_value if isna(val): if not is_valid_na_for_dtype(val, self.subtype): - warnings.warn( - "Allowing arbitrary scalar fill_value in SparseDtype is " - "deprecated. In a future version, the fill_value must be " - "a valid value for the SparseDtype.subtype.", - FutureWarning, - stacklevel=find_stack_level(), + raise ValueError( + # GH#53043 + "fill_value must be a valid value for the SparseDtype.subtype" ) else: dummy = np.empty(0, dtype=self.subtype) dummy = ensure_wrapped_if_datetimelike(dummy) if not can_hold_element(dummy, val): - warnings.warn( - "Allowing arbitrary scalar fill_value in SparseDtype is " - "deprecated. In a future version, the fill_value must be " - "a valid value for the SparseDtype.subtype.", - FutureWarning, - stacklevel=find_stack_level(), + raise ValueError( + # GH#53043 + "fill_value must be a valid value for the SparseDtype.subtype" ) @property diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 50a93994dc76b..b595e4d2158fc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -64,6 +64,7 @@ from pandas.util._decorators import ( Appender, Substitution, + deprecate_nonkeyword_arguments, doc, set_module, ) @@ -994,6 +995,11 @@ def axes(self) -> list[Index]: It has the row axis labels and column axis labels as the only members. They are returned in that order. + See Also + -------- + DataFrame.index: The index (row labels) of the DataFrame. + DataFrame.columns: The column labels of the DataFrame. 
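On the DatetimeTZDtype docstring fix earlier in this chunk: the text is catching up with the non-nanosecond units the constructor already accepts, for example:

>>> import pandas as pd
>>> pd.DatetimeTZDtype(unit="ms", tz="UTC")
datetime64[ms, UTC]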
+ Examples -------- >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) @@ -2295,8 +2301,8 @@ def maybe_reorder( exclude.update(index) if any(exclude): - arr_exclude = [x for x in exclude if x in arr_columns] - to_remove = [arr_columns.get_loc(col) for col in arr_exclude] + arr_exclude = (x for x in exclude if x in arr_columns) + to_remove = {arr_columns.get_loc(col) for col in arr_exclude} arrays = [v for i, v in enumerate(arrays) if i not in to_remove] columns = columns.drop(exclude) @@ -3699,7 +3705,7 @@ def transpose( nv.validate_transpose(args, {}) # construct the args - dtypes = list(self.dtypes) + first_dtype = self.dtypes.iloc[0] if len(self.columns) else None if self._can_fast_transpose: # Note: tests pass without this, but this improves perf quite a bit. @@ -3717,11 +3723,11 @@ def transpose( elif ( self._is_homogeneous_type - and dtypes - and isinstance(dtypes[0], ExtensionDtype) + and first_dtype is not None + and isinstance(first_dtype, ExtensionDtype) ): new_values: list - if isinstance(dtypes[0], BaseMaskedDtype): + if isinstance(first_dtype, BaseMaskedDtype): # We have masked arrays with the same dtype. We can transpose faster. from pandas.core.arrays.masked import ( transpose_homogeneous_masked_arrays, @@ -3730,7 +3736,7 @@ def transpose( new_values = transpose_homogeneous_masked_arrays( cast(Sequence[BaseMaskedArray], self._iter_column_arrays()) ) - elif isinstance(dtypes[0], ArrowDtype): + elif isinstance(first_dtype, ArrowDtype): # We have arrow EAs with the same dtype. We can transpose faster. from pandas.core.arrays.arrow.array import ( ArrowExtensionArray, @@ -3742,10 +3748,11 @@ def transpose( ) else: # We have other EAs with the same dtype. We preserve dtype in transpose. - dtyp = dtypes[0] - arr_typ = dtyp.construct_array_type() + arr_typ = first_dtype.construct_array_type() values = self.values - new_values = [arr_typ._from_sequence(row, dtype=dtyp) for row in values] + new_values = [ + arr_typ._from_sequence(row, dtype=first_dtype) for row in values + ] result = type(self)._from_arrays( new_values, @@ -3849,8 +3856,10 @@ def __getitem__(self, key): key = lib.item_from_zerodim(key) key = com.apply_if_callable(key, self) - if is_hashable(key) and not is_iterator(key): + if is_hashable(key) and not is_iterator(key) and not isinstance(key, slice): # is_iterator to exclude generator e.g. test_getitem_listlike + # As of Python 3.12, slice is hashable which breaks MultiIndex (GH#57500) + # shortcut if the key is in columns is_mi = isinstance(self.columns, MultiIndex) # GH#45316 Return view if key is not duplicated @@ -4915,6 +4924,11 @@ def assign(self, **kwargs) -> DataFrame: A new DataFrame with the new columns in addition to all the existing columns. + See Also + -------- + DataFrame.loc : Select a subset of a DataFrame by labels. + DataFrame.iloc : Select a subset of a DataFrame by positions. + Notes ----- Assigning multiple columns within the same ``assign`` is possible. 
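The __getitem__ guard above (GH#57500) is needed because slice objects became hashable in Python 3.12 and could otherwise be mistaken for column keys; plain positional row slicing keeps behaving as before:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2, 3]})
>>> df[1:3]
   a
1  2
2  3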
@@ -5874,7 +5888,7 @@ def set_index( else: arrays.append(self.index) - to_remove: list[Hashable] = [] + to_remove: set[Hashable] = set() for col in keys: if isinstance(col, MultiIndex): arrays.extend(col._get_level_values(n) for n in range(col.nlevels)) @@ -5901,7 +5915,7 @@ def set_index( arrays.append(frame[col]) names.append(col) if drop: - to_remove.append(col) + to_remove.add(col) if len(arrays[-1]) != len(self): # check newest element against length of calling frame, since @@ -5918,7 +5932,7 @@ def set_index( raise ValueError(f"Index has duplicate keys: {duplicates}") # use set to handle duplicate column names gracefully in case of drop - for c in set(to_remove): + for c in to_remove: del frame[c] # clear up memory usage @@ -6167,12 +6181,13 @@ class max type names = self.index._get_default_index_names(names, default) if isinstance(self.index, MultiIndex): - to_insert = zip(self.index.levels, self.index.codes) + to_insert = zip(reversed(self.index.levels), reversed(self.index.codes)) else: to_insert = ((self.index, None),) multi_col = isinstance(self.columns, MultiIndex) - for i, (lev, lab) in reversed(list(enumerate(to_insert))): + for j, (lev, lab) in enumerate(to_insert, start=1): + i = self.index.nlevels - j if level is not None and i not in level: continue name = names[i] @@ -7162,7 +7177,7 @@ def value_counts( dropna: bool = True, ) -> Series: """ - Return a Series containing the frequency of each distinct row in the Dataframe. + Return a Series containing the frequency of each distinct row in the DataFrame. Parameters ---------- @@ -7175,13 +7190,14 @@ def value_counts( ascending : bool, default False Sort in ascending order. dropna : bool, default True - Don't include counts of rows that contain NA values. + Do not include counts of rows that contain NA values. .. versionadded:: 1.3.0 Returns ------- Series + Series containing the frequency of each distinct row in the DataFrame. See Also -------- @@ -7192,8 +7208,8 @@ def value_counts( The returned Series will have a MultiIndex with one level per input column but an Index (non-multi) for a single label. By default, rows that contain any NA values are omitted from the result. By default, - the resulting Series will be in descending order so that the first - element is the most frequently-occurring row. + the resulting Series will be sorted by frequencies in descending order so that + the first element is the most frequently-occurring row. Examples -------- @@ -9658,6 +9674,8 @@ def unstack( Returns ------- Series or DataFrame + If index is a MultiIndex: DataFrame with pivoted index labels as new + inner-most level column labels, else Series. See Also -------- @@ -11494,7 +11512,7 @@ def any( **kwargs, ) -> Series | bool: ... - @doc(make_doc("any", ndim=2)) + @doc(make_doc("any", ndim=1)) def any( self, *, @@ -11540,7 +11558,8 @@ def all( **kwargs, ) -> Series | bool: ... - @doc(make_doc("all", ndim=2)) + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="all") + @doc(make_doc("all", ndim=1)) def all( self, axis: Axis | None = 0, @@ -11586,6 +11605,7 @@ def min( **kwargs, ) -> Series | Any: ... + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="min") @doc(make_doc("min", ndim=2)) def min( self, @@ -11632,6 +11652,7 @@ def max( **kwargs, ) -> Series | Any: ... 
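A quick illustration of the DataFrame.value_counts wording fixed above (result sorted by frequency, NA rows dropped by default), using an illustrative single-column frame:

>>> import pandas as pd
>>> df = pd.DataFrame({"num_legs": [4, 4, 4, 2, 2, 6]})
>>> df.value_counts()
num_legs
4    3
2    2
6    1
Name: count, dtype: int64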
+ @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="max") @doc(make_doc("max", ndim=2)) def max( self, @@ -11647,6 +11668,7 @@ def max( result = result.__finalize__(self, method="max") return result + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sum") @doc(make_doc("sum", ndim=2)) def sum( self, @@ -11667,6 +11689,7 @@ def sum( result = result.__finalize__(self, method="sum") return result + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="prod") @doc(make_doc("prod", ndim=2)) def prod( self, @@ -11718,6 +11741,7 @@ def mean( **kwargs, ) -> Series | Any: ... + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="mean") @doc(make_doc("mean", ndim=2)) def mean( self, @@ -11764,6 +11788,7 @@ def median( **kwargs, ) -> Series | Any: ... + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="median") @doc(make_doc("median", ndim=2)) def median( self, @@ -11813,6 +11838,7 @@ def sem( **kwargs, ) -> Series | Any: ... + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sem") @doc(make_doc("sem", ndim=2)) def sem( self, @@ -11863,6 +11889,7 @@ def var( **kwargs, ) -> Series | Any: ... + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="var") @doc(make_doc("var", ndim=2)) def var( self, @@ -11913,6 +11940,7 @@ def std( **kwargs, ) -> Series | Any: ... + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="std") @doc(make_doc("std", ndim=2)) def std( self, @@ -11960,6 +11988,7 @@ def skew( **kwargs, ) -> Series | Any: ... + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="skew") @doc(make_doc("skew", ndim=2)) def skew( self, @@ -12006,6 +12035,7 @@ def kurt( **kwargs, ) -> Series | Any: ... 
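All of these reductions now go through deprecate_nonkeyword_arguments, so passing axis and friends positionally (for example df.sum(1)) starts emitting a FutureWarning ahead of the arguments becoming keyword-only in 3.0; the keyword spelling is unaffected. Sketch:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
>>> df.sum(axis=1)  # keyword form, no deprecation warning
0    4
1    6
dtype: int64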
+ @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="kurt") @doc(make_doc("kurt", ndim=2)) def kurt( self, @@ -12026,20 +12056,52 @@ def kurt( product = prod @doc(make_doc("cummin", ndim=2)) - def cummin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self: - return NDFrame.cummin(self, axis, skipna, *args, **kwargs) + def cummin( + self, + axis: Axis = 0, + skipna: bool = True, + numeric_only: bool = False, + *args, + **kwargs, + ) -> Self: + data = self._get_numeric_data() if numeric_only else self + return NDFrame.cummin(data, axis, skipna, *args, **kwargs) @doc(make_doc("cummax", ndim=2)) - def cummax(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self: - return NDFrame.cummax(self, axis, skipna, *args, **kwargs) + def cummax( + self, + axis: Axis = 0, + skipna: bool = True, + numeric_only: bool = False, + *args, + **kwargs, + ) -> Self: + data = self._get_numeric_data() if numeric_only else self + return NDFrame.cummax(data, axis, skipna, *args, **kwargs) @doc(make_doc("cumsum", ndim=2)) - def cumsum(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self: - return NDFrame.cumsum(self, axis, skipna, *args, **kwargs) + def cumsum( + self, + axis: Axis = 0, + skipna: bool = True, + numeric_only: bool = False, + *args, + **kwargs, + ) -> Self: + data = self._get_numeric_data() if numeric_only else self + return NDFrame.cumsum(data, axis, skipna, *args, **kwargs) @doc(make_doc("cumprod", 2)) - def cumprod(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self: - return NDFrame.cumprod(self, axis, skipna, *args, **kwargs) + def cumprod( + self, + axis: Axis = 0, + skipna: bool = True, + numeric_only: bool = False, + *args, + **kwargs, + ) -> Self: + data = self._get_numeric_data() if numeric_only else self + return NDFrame.cumprod(data, axis, skipna, *args, **kwargs) def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: """ @@ -12479,7 +12541,9 @@ def to_timestamp( copy: bool | lib.NoDefault = lib.no_default, ) -> DataFrame: """ - Cast to DatetimeIndex of timestamps, at *beginning* of period. + Cast PeriodIndex to DatetimeIndex of timestamps, at *beginning* of period. + + This can be changed to the *end* of the period, by specifying `how="e"`. Parameters ---------- @@ -12509,8 +12573,13 @@ def to_timestamp( Returns ------- - DataFrame - The DataFrame has a DatetimeIndex. + DataFrame with DatetimeIndex + DataFrame with the PeriodIndex cast to DatetimeIndex. + + See Also + -------- + DataFrame.to_period: Inverse method to cast DatetimeIndex to PeriodIndex. + Series.to_timestamp: Equivalent method for Series. Examples -------- @@ -12566,7 +12635,8 @@ def to_period( Convert DataFrame from DatetimeIndex to PeriodIndex. Convert DataFrame from DatetimeIndex to PeriodIndex with desired - frequency (inferred from index if not passed). + frequency (inferred from index if not passed). Either index of columns can be + converted, depending on `axis` argument. Parameters ---------- @@ -12594,7 +12664,12 @@ def to_period( Returns ------- DataFrame - The DataFrame has a PeriodIndex. + The DataFrame with the converted PeriodIndex. + + See Also + -------- + Series.to_period: Equivalent method for Series. + Series.dt.to_period: Convert DateTime column values. Examples -------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 858d2ba82a969..028492f5617bd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2045,7 +2045,7 @@ def __setstate__(self, state) -> None: # e.g. 
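The cummin/cummax/cumsum/cumprod signatures above gain numeric_only, mirroring the other reductions: when set, non-numeric columns are dropped before accumulating. Sketch:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
>>> df.cumsum(numeric_only=True)
   a
0  1
1  3
2  6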
say fill_value needing _mgr to be # defined meta = set(self._internal_names + self._metadata) - for k in list(meta): + for k in meta: if k in state and k != "_flags": v = state[k] object.__setattr__(self, k, v) @@ -4203,6 +4203,11 @@ def get(self, key, default=None): same type as items contained in object Item for given key or ``default`` value, if key is not found. + See Also + -------- + DataFrame.get : Get item from object for given key (ex: DataFrame column). + Series.get : Get item from object for given key (ex: DataFrame column). + Examples -------- >>> df = pd.DataFrame( @@ -6122,6 +6127,10 @@ def dtypes(self): pandas.Series The data type of each column. + See Also + -------- + Series.dtypes : Return the dtype object of the underlying data. + Examples -------- >>> df = pd.DataFrame( @@ -6729,7 +6738,7 @@ def _pad_or_backfill( if axis == 1: if not self._mgr.is_single_block and inplace: - raise NotImplementedError() + raise NotImplementedError # e.g. test_align_fill_method result = self.T._pad_or_backfill( method=method, limit=limit, limit_area=limit_area @@ -6752,7 +6761,7 @@ def _pad_or_backfill( @overload def fillna( self, - value: Hashable | Mapping | Series | DataFrame = ..., + value: Hashable | Mapping | Series | DataFrame, *, axis: Axis | None = ..., inplace: Literal[False] = ..., @@ -6762,7 +6771,7 @@ def fillna( @overload def fillna( self, - value: Hashable | Mapping | Series | DataFrame = ..., + value: Hashable | Mapping | Series | DataFrame, *, axis: Axis | None = ..., inplace: Literal[True], @@ -6772,7 +6781,7 @@ def fillna( @overload def fillna( self, - value: Hashable | Mapping | Series | DataFrame = ..., + value: Hashable | Mapping | Series | DataFrame, *, axis: Axis | None = ..., inplace: bool = ..., @@ -6786,7 +6795,7 @@ def fillna( ) def fillna( self, - value: Hashable | Mapping | Series | DataFrame | None = None, + value: Hashable | Mapping | Series | DataFrame, *, axis: Axis | None = None, inplace: bool = False, @@ -6827,6 +6836,12 @@ def fillna( reindex : Conform object to new index. asfreq : Convert TimeSeries to specified frequency. + Notes + ----- + For non-object dtype, ``value=None`` will use the NA value of the dtype. + See more details in the :ref:`Filling missing data` + section. 
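Because value no longer defaults to None, the scalar and dict paths below always receive an explicit argument; calling fillna() with no arguments now fails with an ordinary TypeError from Python instead of the old "Must specify a fill 'value'" ValueError. Typical dict usage is unchanged:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1.0, None], "b": [None, 2.0]})
>>> df.fillna({"a": 0.0, "b": -1.0})
     a    b
0  1.0 -1.0
1  0.0  2.0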
+ Examples -------- >>> df = pd.DataFrame( @@ -6909,101 +6924,92 @@ def fillna( axis = 0 axis = self._get_axis_number(axis) - if value is None: - raise ValueError("Must specify a fill 'value'.") - else: - if self.ndim == 1: - if isinstance(value, (dict, ABCSeries)): - if not len(value): - # test_fillna_nonscalar - if inplace: - return None - return self.copy(deep=False) - from pandas import Series - - value = Series(value) - value = value.reindex(self.index) - value = value._values - elif not is_list_like(value): - pass - else: - raise TypeError( - '"value" parameter must be a scalar, dict ' - "or Series, but you passed a " - f'"{type(value).__name__}"' - ) + if self.ndim == 1: + if isinstance(value, (dict, ABCSeries)): + if not len(value): + # test_fillna_nonscalar + if inplace: + return None + return self.copy(deep=False) + from pandas import Series - new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace) + value = Series(value) + value = value.reindex(self.index) + value = value._values + elif not is_list_like(value): + pass + else: + raise TypeError( + '"value" parameter must be a scalar, dict ' + "or Series, but you passed a " + f'"{type(value).__name__}"' + ) - elif isinstance(value, (dict, ABCSeries)): - if axis == 1: - raise NotImplementedError( - "Currently only can fill " - "with dict/Series column " - "by column" - ) - result = self if inplace else self.copy(deep=False) - for k, v in value.items(): - if k not in result: - continue + new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace) + + elif isinstance(value, (dict, ABCSeries)): + if axis == 1: + raise NotImplementedError( + "Currently only can fill with dict/Series column by column" + ) + result = self if inplace else self.copy(deep=False) + for k, v in value.items(): + if k not in result: + continue - res_k = result[k].fillna(v, limit=limit) + res_k = result[k].fillna(v, limit=limit) - if not inplace: - result[k] = res_k + if not inplace: + result[k] = res_k + else: + # We can write into our existing column(s) iff dtype + # was preserved. + if isinstance(res_k, ABCSeries): + # i.e. 'k' only shows up once in self.columns + if res_k.dtype == result[k].dtype: + result.loc[:, k] = res_k + else: + # Different dtype -> no way to do inplace. + result[k] = res_k else: - # We can write into our existing column(s) iff dtype - # was preserved. - if isinstance(res_k, ABCSeries): - # i.e. 'k' only shows up once in self.columns - if res_k.dtype == result[k].dtype: - result.loc[:, k] = res_k + # see test_fillna_dict_inplace_nonunique_columns + locs = result.columns.get_loc(k) + if isinstance(locs, slice): + locs = np.arange(self.shape[1])[locs] + elif isinstance(locs, np.ndarray) and locs.dtype.kind == "b": + locs = locs.nonzero()[0] + elif not ( + isinstance(locs, np.ndarray) and locs.dtype.kind == "i" + ): + # Should never be reached, but let's cover our bases + raise NotImplementedError( + "Unexpected get_loc result, please report a bug at " + "https://github.com/pandas-dev/pandas" + ) + + for i, loc in enumerate(locs): + res_loc = res_k.iloc[:, i] + target = self.iloc[:, loc] + + if res_loc.dtype == target.dtype: + result.iloc[:, loc] = res_loc else: - # Different dtype -> no way to do inplace. 
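[Editor's aside: a minimal example of the dict/Series code path retained above, which fills column by column; values are illustrative, and per the hunk a dict fill with axis=1 still raises NotImplementedError.]

import pandas as pd

df = pd.DataFrame({"A": [1.0, None], "B": [None, 4.0]})
df.fillna({"A": 0.0, "B": -1.0})   # each key fills only its own column
# df.fillna({"A": 0.0}, axis=1)    # NotImplementedError: dict/Series fills column by column only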
- result[k] = res_k - else: - # see test_fillna_dict_inplace_nonunique_columns - locs = result.columns.get_loc(k) - if isinstance(locs, slice): - locs = np.arange(self.shape[1])[locs] - elif ( - isinstance(locs, np.ndarray) and locs.dtype.kind == "b" - ): - locs = locs.nonzero()[0] - elif not ( - isinstance(locs, np.ndarray) and locs.dtype.kind == "i" - ): - # Should never be reached, but let's cover our bases - raise NotImplementedError( - "Unexpected get_loc result, please report a bug at " - "https://github.com/pandas-dev/pandas" - ) - - for i, loc in enumerate(locs): - res_loc = res_k.iloc[:, i] - target = self.iloc[:, loc] - - if res_loc.dtype == target.dtype: - result.iloc[:, loc] = res_loc - else: - result.isetitem(loc, res_loc) - if inplace: - return self._update_inplace(result) - else: - return result + result.isetitem(loc, res_loc) + if inplace: + return self._update_inplace(result) + else: + return result - elif not is_list_like(value): - if axis == 1: - result = self.T.fillna(value=value, limit=limit).T - new_data = result._mgr - else: - new_data = self._mgr.fillna( - value=value, limit=limit, inplace=inplace - ) - elif isinstance(value, ABCDataFrame) and self.ndim == 2: - new_data = self.where(self.notna(), value)._mgr + elif not is_list_like(value): + if axis == 1: + result = self.T.fillna(value=value, limit=limit).T + new_data = result._mgr else: - raise ValueError(f"invalid fill value with a {type(value)}") + new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace) + elif isinstance(value, ABCDataFrame) and self.ndim == 2: + new_data = self.where(self.notna(), value)._mgr + else: + raise ValueError(f"invalid fill value with a {type(value)}") result = self._constructor_from_mgr(new_data, axes=new_data.axes) if inplace: @@ -7089,6 +7095,11 @@ def ffill( {klass} or None Object with missing values filled or None if ``inplace=True``. + See Also + -------- + DataFrame.bfill : Fill NA/NaN values by using the next valid observation + to fill the gap. + Examples -------- >>> df = pd.DataFrame( @@ -7217,6 +7228,11 @@ def bfill( {klass} or None Object with missing values filled or None if ``inplace=True``. + See Also + -------- + DataFrame.ffill : Fill NA/NaN values by propagating the last valid + observation to next valid. + Examples -------- For Series: @@ -7319,17 +7335,8 @@ def replace( inplace: bool = False, regex: bool = False, ) -> Self | None: - if value is lib.no_default and not is_dict_like(to_replace) and regex is False: - # case that goes through _replace_single and defaults to method="pad" - warnings.warn( - # GH#33302 - f"{type(self).__name__}.replace without 'value' and with " - "non-dict-like 'to_replace' is deprecated " - "and will raise in a future version. " - "Explicitly specify the new values instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) + if not is_bool(regex) and to_replace is not None: + raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool") if not ( is_scalar(to_replace) @@ -7342,6 +7349,15 @@ def replace( f"{type(to_replace).__name__!r}" ) + if value is lib.no_default and not ( + is_dict_like(to_replace) or is_dict_like(regex) + ): + raise ValueError( + # GH#33302 + f"{type(self).__name__}.replace must specify either 'value', " + "a dict-like 'to_replace', or dict-like 'regex'." 
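[Editor's aside: a small sketch of the stricter ``replace`` contract introduced here, where the former pad-based deprecation path now raises; calls are illustrative and assume this change.]

import pandas as pd

s = pd.Series([1, 2, 3])
s.replace(1, 10)     # scalar to_replace with an explicit value: OK
s.replace({1: 10})   # dict-like to_replace, no value needed: OK
# s.replace(1)       # ValueError: must specify 'value', a dict-like
#                    # 'to_replace', or a dict-like 'regex'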
+ ) + inplace = validate_bool_kwarg(inplace, "inplace") if inplace: if not PYPY: @@ -7352,41 +7368,10 @@ def replace( stacklevel=2, ) - if not is_bool(regex) and to_replace is not None: - raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool") - if value is lib.no_default: - # GH#36984 if the user explicitly passes value=None we want to - # respect that. We have the corner case where the user explicitly - # passes value=None *and* a method, which we interpret as meaning - # they want the (documented) default behavior. - - # passing a single value that is scalar like - # when value is None (GH5319), for compat - if not is_dict_like(to_replace) and not is_dict_like(regex): - to_replace = [to_replace] - - if isinstance(to_replace, (tuple, list)): - # TODO: Consider copy-on-write for non-replaced columns's here - if isinstance(self, ABCDataFrame): - from pandas import Series - - result = self.apply( - Series._replace_single, - args=(to_replace, inplace), - ) - if inplace: - return None - return result - return self._replace_single(to_replace, inplace) - if not is_dict_like(to_replace): - if not is_dict_like(regex): - raise TypeError( - 'If "to_replace" and "value" are both None ' - 'and "to_replace" is not a list, then ' - "regex must be a mapping" - ) + # In this case we have checked above that + # 1) regex is dict-like and 2) to_replace is None to_replace = regex regex = True @@ -7749,11 +7734,6 @@ def interpolate( raise ValueError("'method' should be a string, not None.") obj, should_transpose = (self.T, True) if axis == 1 else (self, False) - # GH#53631 - if np.any(obj.dtypes == object): - raise TypeError( - f"{type(self).__name__} cannot interpolate with object dtype." - ) if isinstance(obj.index, MultiIndex) and method != "linear": raise ValueError( @@ -9807,8 +9787,10 @@ def where( Returns ------- - Series or DataFrame unless ``inplace=True`` in which case - returns None. + Series or DataFrame or None + When applied to a Series, the function will return a Series, + and when applied to a DataFrame, it will return a DataFrame; + if ``inplace=True``, it will return None. See Also -------- @@ -10423,6 +10405,11 @@ def tz_convert( TypeError If the axis is tz-naive. + See Also + -------- + DataFrame.tz_localize: Localize tz-naive index of DataFrame to target time zone. + Series.tz_localize: Localize tz-naive index of Series to target time zone. + Examples -------- Change to another time zone: @@ -10485,10 +10472,10 @@ def tz_localize( nonexistent: TimeNonexistent = "raise", ) -> Self: """ - Localize tz-naive index of a Series or DataFrame to target time zone. + Localize time zone naive index of a Series or DataFrame to target time zone. This operation localizes the Index. To localize the values in a - timezone-naive Series, use :meth:`Series.dt.tz_localize`. + time zone naive Series, use :meth:`Series.dt.tz_localize`. Parameters ---------- @@ -10548,13 +10535,19 @@ def tz_localize( Returns ------- {klass} - Same type as the input. + Same type as the input, with time zone naive or aware index, depending on + ``tz``. Raises ------ TypeError If the TimeSeries is tz-aware and tz is not None. + See Also + -------- + Series.dt.tz_localize: Localize the values in a time zone naive Series. + Timestamp.tz_localize: Localize the Timestamp to a timezone. + Examples -------- Localize local times: @@ -11712,7 +11705,7 @@ def last_valid_index(self) -> Hashable: skipna : bool, default True Exclude NA/null values when computing the result. 
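[Editor's aside: a brief usage sketch for the ``tz_localize``/``tz_convert`` docstrings touched above; time zones and values are illustrative.]

import pandas as pd

s = pd.Series([1, 2], index=pd.to_datetime(["2024-01-01", "2024-01-02"]))
s_utc = s.tz_localize("UTC")            # localize a time zone naive index
s_est = s_utc.tz_convert("US/Eastern")  # convert an already tz-aware index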
numeric_only : bool, default False - Include only float, int, boolean columns. Not implemented for Series. + Include only float, int, boolean columns. {min_count}\ **kwargs @@ -11881,9 +11874,9 @@ def last_valid_index(self) -> Hashable: Returns ------- -{name1} or {name2} - If level is specified, then, {name2} is returned; otherwise, {name1} - is returned. +{name2} or {name1} + If axis=None, then a scalar boolean is returned. + Otherwise a Series is returned with index matching the index argument. {see_also} {examples}""" @@ -11948,7 +11941,45 @@ def last_valid_index(self) -> Hashable: DataFrame.any : Return True if one (or more) elements are True. """ -_cnum_doc = """ +_cnum_pd_doc = """ +Return cumulative {desc} over a DataFrame or Series axis. + +Returns a DataFrame or Series of the same size containing the cumulative +{desc}. + +Parameters +---------- +axis : {{0 or 'index', 1 or 'columns'}}, default 0 + The index or the name of the axis. 0 is equivalent to None or 'index'. + For `Series` this parameter is unused and defaults to 0. +skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. +numeric_only : bool, default False + Include only float, int, boolean columns. +*args, **kwargs + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + +Returns +------- +{name1} or {name2} + Return cumulative {desc} of {name1} or {name2}. + +See Also +-------- +core.window.expanding.Expanding.{accum_func_name} : Similar functionality + but ignores ``NaN`` values. +{name2}.{accum_func_name} : Return the {desc} over + {name2} axis. +{name2}.cummax : Return cumulative maximum over {name2} axis. +{name2}.cummin : Return cumulative minimum over {name2} axis. +{name2}.cumsum : Return cumulative sum over {name2} axis. +{name2}.cumprod : Return cumulative product over {name2} axis. + +{examples}""" + +_cnum_series_doc = """ Return cumulative {desc} over a DataFrame or Series axis. Returns a DataFrame or Series of the same size containing the cumulative @@ -12739,28 +12770,44 @@ def make_doc(name: str, ndim: int) -> str: kwargs = {"min_count": ""} elif name == "cumsum": - base_doc = _cnum_doc + if ndim == 1: + base_doc = _cnum_series_doc + else: + base_doc = _cnum_pd_doc + desc = "sum" see_also = "" examples = _cumsum_examples kwargs = {"accum_func_name": "sum"} elif name == "cumprod": - base_doc = _cnum_doc + if ndim == 1: + base_doc = _cnum_series_doc + else: + base_doc = _cnum_pd_doc + desc = "product" see_also = "" examples = _cumprod_examples kwargs = {"accum_func_name": "prod"} elif name == "cummin": - base_doc = _cnum_doc + if ndim == 1: + base_doc = _cnum_series_doc + else: + base_doc = _cnum_pd_doc + desc = "minimum" see_also = "" examples = _cummin_examples kwargs = {"accum_func_name": "min"} elif name == "cummax": - base_doc = _cnum_doc + if ndim == 1: + base_doc = _cnum_series_doc + else: + base_doc = _cnum_pd_doc + desc = "maximum" see_also = "" examples = _cummax_examples diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 361e9e87fadb8..0a048d11d0b4d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -240,6 +240,7 @@ def apply(self, func, *args, **kwargs) -> Series: Returns ------- Series or DataFrame + A pandas object with the result of applying ``func`` to each group. See Also -------- @@ -600,6 +601,7 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): Returns ------- Series + The filtered subset of the original Series. 
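[Editor's aside: a hedged illustration of the reworded Returns section for the ``any``/``all`` reduction template above; outputs are described in comments.]

import pandas as pd

df = pd.DataFrame({"a": [True, False], "b": [True, True]})
df.any()            # a Series indexed by the column labels
df.any(axis=None)   # a single scalar boolean, as the new wording describes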
Notes ----- @@ -1078,6 +1080,7 @@ def skew( Returns ------- Series + Unbiased skew within groups. See Also -------- @@ -1941,6 +1944,7 @@ def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame: Returns ------- DataFrame + The filtered subset of the original DataFrame. Notes ----- @@ -2108,6 +2112,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: Returns ------- nunique: DataFrame + Counts of unique elements in each position. Examples -------- @@ -2506,6 +2511,7 @@ def skew( Returns ------- DataFrame + Unbiased skew within groups. See Also -------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bd8e222831d0c..bc37405b25a16 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -333,6 +333,8 @@ class providing the base-class of operations. Returns ------- %(klass)s + %(klass)s with the same indexes as the original object filled + with transformed values. See Also -------- @@ -1551,6 +1553,7 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: Returns ------- Series or DataFrame + A pandas object with the result of applying ``func`` to each group. See Also -------- @@ -2245,6 +2248,7 @@ def mean( Returns ------- pandas.Series or pandas.DataFrame + Mean of values within each group. Same object type as the caller. %(see_also)s Examples -------- @@ -3512,11 +3516,8 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Returns ------- - pandas.api.typing.DatetimeIndexResamplerGroupby, - pandas.api.typing.PeriodIndexResamplerGroupby, or - pandas.api.typing.TimedeltaIndexResamplerGroupby - Return a new groupby object, with type depending on the data - being resampled. + DatetimeIndexResampler, PeriodIndexResampler or TimdeltaResampler + Resampler object for the type of the index. See Also -------- @@ -4591,7 +4592,8 @@ def rank( Returns ------- - DataFrame with ranking of values within each group + DataFrame + The ranking of values within each group. %(see_also)s Examples -------- @@ -4663,6 +4665,7 @@ def cumprod(self, *args, **kwargs) -> NDFrameT: Returns ------- Series or DataFrame + Cumulative product for each group. Same object type as the caller. %(see_also)s Examples -------- @@ -4721,6 +4724,7 @@ def cumsum(self, *args, **kwargs) -> NDFrameT: Returns ------- Series or DataFrame + Cumulative sum for each group. Same object type as the caller. %(see_also)s Examples -------- @@ -4783,6 +4787,7 @@ def cummin( Returns ------- Series or DataFrame + Cumulative min for each group. Same object type as the caller. %(see_also)s Examples -------- @@ -4853,6 +4858,7 @@ def cummax( Returns ------- Series or DataFrame + Cumulative max for each group. Same object type as the caller. 
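[Editor's aside: a short usage sketch for the groupby ``filter`` docstrings extended above; the data and threshold are made up.]

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 10]})
# Keeps all rows belonging to groups whose mean "val" exceeds 2,
# i.e. only group "b" in this illustrative frame.
df.groupby("key").filter(lambda g: g["val"].mean() > 2)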
%(see_also)s Examples -------- @@ -5008,7 +5014,7 @@ def shift( period = cast(int, period) if freq is not None: f = lambda x: x.shift( - period, # pylint: disable=cell-var-from-loop + period, freq, 0, # axis fill_value, @@ -5603,7 +5609,7 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde idx = cast(MultiIndex, idx) levels = list(idx.levels) + [lev] codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))] - mi = MultiIndex(levels=levels, codes=codes, names=list(idx.names) + [None]) + mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None]) else: nidx = len(idx) idx_codes = coerce_indexer_dtype(np.arange(nidx), idx) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 239d78b3b8b7a..2d10bd5d00eb2 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -117,6 +117,11 @@ class Grouper: A TimeGrouper is returned if ``freq`` is not ``None``. Otherwise, a Grouper is returned. + See Also + -------- + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby. + Examples -------- ``df.groupby(pd.Grouper(key="Animal"))`` is equivalent to ``df.groupby('Animal')`` @@ -263,7 +268,6 @@ def __init__( self.sort = sort self.dropna = dropna - self._grouper_deprecated = None self._indexer_deprecated: npt.NDArray[np.intp] | None = None self.binner = None self._grouper = None @@ -292,10 +296,6 @@ def _get_grouper( validate=validate, dropna=self.dropna, ) - # Without setting this, subsequent lookups to .groups raise - # error: Incompatible types in assignment (expression has type "BaseGrouper", - # variable has type "None") - self._grouper_deprecated = grouper # type: ignore[assignment] return grouper, obj diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 8585ae3828247..0d88882c9b7ef 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -706,7 +706,7 @@ def groups(self) -> dict[Hashable, Index]: return self.groupings[0].groups result_index, ids = self.result_index_and_ids values = result_index._values - categories = Categorical(ids, categories=np.arange(len(result_index))) + categories = Categorical(ids, categories=range(len(result_index))) result = { # mypy is not aware that group has to be an integer values[group]: self.axis.take(axis_ilocs) # type: ignore[call-overload] diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 9b05eb42c6d6e..c5e3f3a50e10d 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -209,60 +209,6 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: indexes, kind = _sanitize_and_check(indexes) - def _unique_indices(inds, dtype) -> Index: - """ - Concatenate indices and remove duplicates. 
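[Editor's aside: a minimal usage sketch for the new ``Grouper`` See Also entries; the frequency and data are illustrative.]

import pandas as pd

df = pd.DataFrame(
    {"ts": pd.date_range("2024-01-01", periods=4, freq="D"), "val": range(4)}
)
# Group the "val" column by the "ts" column in 2-day bins.
df.groupby(pd.Grouper(key="ts", freq="2D"))["val"].sum()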
- - Parameters - ---------- - inds : list of Index or list objects - dtype : dtype to set for the resulting Index - - Returns - ------- - Index - """ - if all(isinstance(ind, Index) for ind in inds): - inds = [ind.astype(dtype, copy=False) for ind in inds] - result = inds[0].unique() - other = inds[1].append(inds[2:]) - diff = other[result.get_indexer_for(other) == -1] - if len(diff): - result = result.append(diff.unique()) - if sort: - result = result.sort_values() - return result - - def conv(i): - if isinstance(i, Index): - i = i.tolist() - return i - - return Index( - lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort), - dtype=dtype, - ) - - def _find_common_index_dtype(inds): - """ - Finds a common type for the indexes to pass through to resulting index. - - Parameters - ---------- - inds: list of Index or list objects - - Returns - ------- - The common type or None if no indexes were given - """ - dtypes = [idx.dtype for idx in indexes if isinstance(idx, Index)] - if dtypes: - dtype = find_common_type(dtypes) - else: - dtype = None - - return dtype - if kind == "special": result = indexes[0] @@ -294,18 +240,36 @@ def _find_common_index_dtype(inds): return result elif kind == "array": - dtype = _find_common_index_dtype(indexes) - index = indexes[0] - if not all(index.equals(other) for other in indexes[1:]): - index = _unique_indices(indexes, dtype) + if not all_indexes_same(indexes): + dtype = find_common_type([idx.dtype for idx in indexes]) + inds = [ind.astype(dtype, copy=False) for ind in indexes] + index = inds[0].unique() + other = inds[1].append(inds[2:]) + diff = other[index.get_indexer_for(other) == -1] + if len(diff): + index = index.append(diff.unique()) + if sort: + index = index.sort_values() + else: + index = indexes[0] name = get_unanimous_names(*indexes)[0] if name != index.name: index = index.rename(name) return index - else: # kind='list' - dtype = _find_common_index_dtype(indexes) - return _unique_indices(indexes, dtype) + elif kind == "list": + dtypes = [idx.dtype for idx in indexes if isinstance(idx, Index)] + if dtypes: + dtype = find_common_type(dtypes) + else: + dtype = None + all_lists = (idx.tolist() if isinstance(idx, Index) else idx for idx in indexes) + return Index( + lib.fast_unique_multiple_list_gen(all_lists, sort=bool(sort)), + dtype=dtype, + ) + else: + raise ValueError(f"{kind=} must be 'special', 'array' or 'list'.") def _sanitize_and_check(indexes): @@ -329,14 +293,14 @@ def _sanitize_and_check(indexes): sanitized_indexes : list of Index or list objects type : {'list', 'array', 'special'} """ - kinds = list({type(index) for index in indexes}) + kinds = {type(index) for index in indexes} if list in kinds: if len(kinds) > 1: indexes = [ Index(list(x)) if not isinstance(x, Index) else x for x in indexes ] - kinds.remove(list) + kinds -= {list} else: return indexes, "list" diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 30cf6f0b866ee..69f916bb3f769 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -174,9 +174,9 @@ disallow_ndim_indexing, is_valid_positional_slice, ) +from pandas.core.indexes.frozen import FrozenList from pandas.core.missing import clean_reindex_fill_method from pandas.core.ops import get_op_result_name -from pandas.core.ops.invalid import make_invalid_op from pandas.core.sorting import ( ensure_key_mapped, get_group_index_sorter, @@ -1727,8 +1727,8 @@ def _get_default_index_names( return names - def _get_names(self) -> tuple[Hashable | None, ...]: - return 
(self.name,) + def _get_names(self) -> FrozenList: + return FrozenList((self.name,)) def _set_names(self, values, *, level=None) -> None: """ @@ -1822,7 +1822,7 @@ def set_names(self, names, *, level=None, inplace: bool = False) -> Self | None: ('python', 2019), ( 'cobra', 2018), ( 'cobra', 2019)], - names=('species', 'year')) + names=['species', 'year']) When renaming levels with a dict, levels can not be passed. @@ -1831,7 +1831,7 @@ def set_names(self, names, *, level=None, inplace: bool = False) -> Self | None: ('python', 2019), ( 'cobra', 2018), ( 'cobra', 2019)], - names=('snake', 'year')) + names=['snake', 'year']) """ if level is not None and not isinstance(self, ABCMultiIndex): raise ValueError("Level must be None for non-MultiIndex") @@ -1916,13 +1916,13 @@ def rename(self, name, *, inplace: bool = False) -> Self | None: ('python', 2019), ( 'cobra', 2018), ( 'cobra', 2019)], - names=('kind', 'year')) + names=['kind', 'year']) >>> idx.rename(["species", "year"]) MultiIndex([('python', 2018), ('python', 2019), ( 'cobra', 2018), ( 'cobra', 2019)], - names=('species', 'year')) + names=['species', 'year']) >>> idx.rename("species") Traceback (most recent call last): TypeError: Must pass list-like as `names`. @@ -2086,22 +2086,22 @@ def droplevel(self, level: IndexLabel = 0): >>> mi MultiIndex([(1, 3, 5), (2, 4, 6)], - names=('x', 'y', 'z')) + names=['x', 'y', 'z']) >>> mi.droplevel() MultiIndex([(3, 5), (4, 6)], - names=('y', 'z')) + names=['y', 'z']) >>> mi.droplevel(2) MultiIndex([(1, 3), (2, 4)], - names=('x', 'y')) + names=['x', 'y']) >>> mi.droplevel("z") MultiIndex([(1, 3), (2, 4)], - names=('x', 'y')) + names=['x', 'y']) >>> mi.droplevel(["x", "y"]) Index([5, 6], dtype='int64', name='z') @@ -2543,7 +2543,7 @@ def notna(self) -> npt.NDArray[np.bool_]: notnull = notna - def fillna(self, value=None): + def fillna(self, value): """ Fill NA/NaN values with the specified value. @@ -4438,9 +4438,7 @@ def _join_level( """ from pandas.core.indexes.multi import MultiIndex - def _get_leaf_sorter( - labels: tuple[np.ndarray, ...] | list[np.ndarray], - ) -> npt.NDArray[np.intp]: + def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]: """ Returns sorter for the inner most level while preserving the order of higher levels. @@ -6185,13 +6183,13 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]: array([ True, False, False]) >>> midx = pd.MultiIndex.from_arrays( - ... [[1, 2, 3], ["red", "blue", "green"]], names=("number", "color") + ... [[1, 2, 3], ["red", "blue", "green"]], names=["number", "color"] ... ) >>> midx MultiIndex([(1, 'red'), (2, 'blue'), (3, 'green')], - names=('number', 'color')) + names=['number', 'color']) Check whether the strings in the 'color' level of the MultiIndex are in a list of colors. @@ -6453,6 +6451,10 @@ def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: >>> idx = pd.Index(list("abcd")) >>> idx.slice_locs(start="b", end="c") (1, 3) + + >>> idx = pd.Index(list("bcde")) + >>> idx.slice_locs(start="a", end="c") + (0, 2) """ inc = step is None or step >= 0 @@ -6938,14 +6940,8 @@ def _maybe_disable_logical_methods(self, opname: str_t) -> None: """ raise if this Index subclass does not support any or all. 
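[Editor's aside: the docstring updates above switch the rendered ``names`` to the list-style ``FrozenList`` form; a hedged sketch of what that looks like interactively, with the repr shown as a comment.]

import pandas as pd

mi = pd.MultiIndex.from_arrays([[1, 2], ["a", "b"]], names=["num", "let"])
mi.names                        # FrozenList(['num', 'let'])
mi.set_names(["n", "l"]).names  # FrozenList(['n', 'l'])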
""" - if ( - isinstance(self, ABCMultiIndex) - # TODO(3.0): PeriodArray and DatetimeArray any/all will raise, - # so checking needs_i8_conversion will be unnecessary - or (needs_i8_conversion(self.dtype) and self.dtype.kind != "m") - ): - # This call will raise - make_invalid_op(opname)(self) + if isinstance(self, ABCMultiIndex): + raise TypeError(f"cannot perform {opname} with {type(self).__name__}") @Appender(IndexOpsMixin.argmin.__doc__) def argmin(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: @@ -6953,11 +6949,11 @@ def argmin(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: nv.validate_minmax_axis(axis) if not self._is_multi and self.hasnans: - # Take advantage of cache - if self._isnan.all(): - raise ValueError("Encountered all NA values") - elif not skipna: + if not skipna: raise ValueError("Encountered an NA value with skipna=False") + elif self._isnan.all(): + raise ValueError("Encountered all NA values") + return super().argmin(skipna=skipna) @Appender(IndexOpsMixin.argmax.__doc__) @@ -6966,11 +6962,10 @@ def argmax(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: nv.validate_minmax_axis(axis) if not self._is_multi and self.hasnans: - # Take advantage of cache - if self._isnan.all(): - raise ValueError("Encountered all NA values") - elif not skipna: + if not skipna: raise ValueError("Encountered an NA value with skipna=False") + elif self._isnan.all(): + raise ValueError("Encountered all NA values") return super().argmax(skipna=skipna) def min(self, axis=None, skipna: bool = True, *args, **kwargs): @@ -7135,17 +7130,25 @@ def maybe_sequence_to_range(sequence) -> Any | range: ------- Any : input or range """ - if isinstance(sequence, (ABCSeries, Index, range, ExtensionArray)): + if isinstance(sequence, (range, ExtensionArray)): return sequence elif len(sequence) == 1 or lib.infer_dtype(sequence, skipna=False) != "integer": return sequence - elif len(sequence) == 0: + elif isinstance(sequence, (ABCSeries, Index)) and not ( + isinstance(sequence.dtype, np.dtype) and sequence.dtype.kind == "i" + ): + return sequence + if len(sequence) == 0: return range(0) - diff = sequence[1] - sequence[0] + try: + np_sequence = np.asarray(sequence, dtype=np.int64) + except OverflowError: + return sequence + diff = np_sequence[1] - np_sequence[0] if diff == 0: return sequence - elif len(sequence) == 2 or lib.is_sequence_range(np.asarray(sequence), diff): - return range(sequence[0], sequence[-1] + diff, diff) + elif len(sequence) == 2 or lib.is_sequence_range(np_sequence, diff): + return range(np_sequence[0], np_sequence[-1] + diff, diff) else: return sequence @@ -7174,7 +7177,7 @@ def ensure_index_from_sequences(sequences, names=None) -> Index: >>> ensure_index_from_sequences([["a", "a"], ["a", "b"]], names=["L1", "L2"]) MultiIndex([('a', 'a'), ('a', 'b')], - names=('L1', 'L2')) + names=['L1', 'L2']) See Also -------- diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py new file mode 100644 index 0000000000000..c559c529586b5 --- /dev/null +++ b/pandas/core/indexes/frozen.py @@ -0,0 +1,121 @@ +""" +frozen (immutable) data structures to support MultiIndexing + +These are used for: + +- .names (FrozenList) + +""" + +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + NoReturn, +) + +from pandas.core.base import PandasObject + +from pandas.io.formats.printing import pprint_thing + +if TYPE_CHECKING: + from pandas._typing import Self + + +class FrozenList(PandasObject, list): + """ + Container that doesn't allow 
setting item *but* + because it's technically hashable, will be used + for lookups, appropriately, etc. + """ + + # Side note: This has to be of type list. Otherwise, + # it messes up PyTables type checks. + + def union(self, other) -> FrozenList: + """ + Returns a FrozenList with other concatenated to the end of self. + + Parameters + ---------- + other : array-like + The array-like whose elements we are concatenating. + + Returns + ------- + FrozenList + The collection difference between self and other. + """ + if isinstance(other, tuple): + other = list(other) + return type(self)(super().__add__(other)) + + def difference(self, other) -> FrozenList: + """ + Returns a FrozenList with elements from other removed from self. + + Parameters + ---------- + other : array-like + The array-like whose elements we are removing self. + + Returns + ------- + FrozenList + The collection difference between self and other. + """ + other = set(other) + temp = [x for x in self if x not in other] + return type(self)(temp) + + # TODO: Consider deprecating these in favor of `union` (xref gh-15506) + # error: Incompatible types in assignment (expression has type + # "Callable[[FrozenList, Any], FrozenList]", base class "list" defined the + # type as overloaded function) + __add__ = __iadd__ = union # type: ignore[assignment] + + def __getitem__(self, n): + if isinstance(n, slice): + return type(self)(super().__getitem__(n)) + return super().__getitem__(n) + + def __radd__(self, other) -> Self: + if isinstance(other, tuple): + other = list(other) + return type(self)(other + list(self)) + + def __eq__(self, other: object) -> bool: + if isinstance(other, (tuple, FrozenList)): + other = list(other) + return super().__eq__(other) + + __req__ = __eq__ + + def __mul__(self, other) -> Self: + return type(self)(super().__mul__(other)) + + __imul__ = __mul__ + + def __reduce__(self): + return type(self), (list(self),) + + # error: Signature of "__hash__" incompatible with supertype "list" + def __hash__(self) -> int: # type: ignore[override] + return hash(tuple(self)) + + def _disabled(self, *args, **kwargs) -> NoReturn: + """ + This method will not function because object is immutable. + """ + raise TypeError(f"'{type(self).__name__}' does not support mutable operations.") + + def __str__(self) -> str: + return pprint_thing(self, quote_strings=True, escape_chars=("\t", "\r", "\n")) + + def __repr__(self) -> str: + return f"{type(self).__name__}({self!s})" + + __setitem__ = __setslice__ = _disabled # type: ignore[assignment] + __delitem__ = __delslice__ = _disabled + pop = append = extend = _disabled + remove = sort = insert = _disabled # type: ignore[assignment] diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2e554bc848ffe..9df0d26ce622a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -101,6 +101,7 @@ ensure_index, get_unanimous_names, ) +from pandas.core.indexes.frozen import FrozenList from pandas.core.ops.invalid import make_invalid_op from pandas.core.sorting import ( get_group_index, @@ -299,7 +300,7 @@ class MultiIndex(Index): (1, 'blue'), (2, 'red'), (2, 'blue')], - names=('number', 'color')) + names=['number', 'color']) See further examples for how to construct a MultiIndex in the doc strings of the mentioned helper methods. @@ -309,9 +310,9 @@ class MultiIndex(Index): # initialize to zero-length tuples to make everything work _typ = "multiindex" - _names: tuple[Hashable | None, ...] = () - _levels: tuple[Index, ...] 
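[Editor's aside: ``FrozenList`` is an internal helper; a hypothetical interactive sketch of the behavior the class above implements, with the import path taken from the new file.]

from pandas.core.indexes.frozen import FrozenList

fl = FrozenList(["a", "b"])
fl.union(["c"])        # FrozenList(['a', 'b', 'c'])
fl.difference(["a"])   # FrozenList(['b'])
# fl.append("d")       # TypeError: 'FrozenList' does not support mutable operations.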
= () - _codes: tuple[np.ndarray, ...] = () + _names: list[Hashable | None] = [] + _levels = FrozenList() + _codes = FrozenList() _comparables = ["names"] sortorder: int | None @@ -347,7 +348,7 @@ def __new__( result._set_levels(levels, copy=copy, validate=False) result._set_codes(codes, copy=copy, validate=False) - result._names = (None,) * len(levels) + result._names = [None] * len(levels) if names is not None: # handles name validation result._set_names(names) @@ -389,16 +390,16 @@ def _validate_codes(self, level: Index, code: np.ndarray) -> np.ndarray: def _verify_integrity( self, - codes: tuple | None = None, - levels: tuple | None = None, + codes: list | None = None, + levels: list | None = None, levels_to_verify: list[int] | range | None = None, - ) -> tuple: + ) -> FrozenList: """ Parameters ---------- - codes : optional tuple + codes : optional list Codes to check for validity. Defaults to current codes. - levels : optional tuple + levels : optional list Levels to check for validity. Defaults to current levels. levels_to_validate: optional list Specifies the levels to verify. @@ -462,7 +463,7 @@ def _verify_integrity( else: result_codes.append(codes[i]) - new_codes = tuple(result_codes) + new_codes = FrozenList(result_codes) return new_codes @classmethod @@ -505,7 +506,7 @@ def from_arrays( (1, 'blue'), (2, 'red'), (2, 'blue')], - names=('number', 'color')) + names=['number', 'color']) """ error_msg = "Input must be a list / sequence of array-likes." if not is_list_like(arrays): @@ -576,7 +577,7 @@ def from_tuples( (1, 'blue'), (2, 'red'), (2, 'blue')], - names=('number', 'color')) + names=['number', 'color']) """ if not is_list_like(tuples): raise TypeError("Input must be a list / sequence of tuple-likes.") @@ -659,7 +660,7 @@ def from_product( (1, 'purple'), (2, 'green'), (2, 'purple')], - names=('number', 'color')) + names=['number', 'color']) """ from pandas.core.reshape.util import cartesian_product @@ -728,7 +729,7 @@ def from_frame( ('HI', 'Precip'), ('NJ', 'Temp'), ('NJ', 'Precip')], - names=('a', 'b')) + names=['a', 'b']) Using explicit names, instead of the column names @@ -737,7 +738,7 @@ def from_frame( ('HI', 'Precip'), ('NJ', 'Temp'), ('NJ', 'Precip')], - names=('state', 'observation')) + names=['state', 'observation']) """ if not isinstance(df, ABCDataFrame): raise TypeError("Input must be a DataFrame") @@ -760,9 +761,7 @@ def _values(self) -> np.ndarray: vals = index if isinstance(vals.dtype, CategoricalDtype): vals = cast("CategoricalIndex", vals) - # Incompatible types in assignment (expression has type - # "ExtensionArray | ndarray[Any, Any]", variable has type "Index") - vals = vals._data._internal_get_values() # type: ignore[assignment] + vals = vals._data._internal_get_values() if isinstance(vals.dtype, ExtensionDtype) or lib.is_np_dtype( vals.dtype, "mM" @@ -812,7 +811,7 @@ def dtypes(self) -> Series: (1, 'purple'), (2, 'green'), (2, 'purple')], - names=('number', 'color')) + names=['number', 'color']) >>> idx.dtypes number int64 color object @@ -838,7 +837,7 @@ def size(self) -> int: # Levels Methods @cache_readonly - def levels(self) -> tuple[Index, ...]: + def levels(self) -> FrozenList: """ Levels of the MultiIndex. 
@@ -871,8 +870,7 @@ def levels(self) -> tuple[Index, ...]: dog 4 >>> leg_num.index.levels - (Index(['mammal'], dtype='object', name='Category'), - Index(['cat', 'dog', 'goat', 'human'], dtype='object', name='Animals')) + FrozenList([['mammal'], ['cat', 'dog', 'goat', 'human']]) MultiIndex levels will not change even if the DataFrame using the MultiIndex does not contain all them anymore. @@ -887,8 +885,7 @@ def levels(self) -> tuple[Index, ...]: dog 4 >>> large_leg_num.index.levels - (Index(['mammal'], dtype='object', name='Category'), - Index(['cat', 'dog', 'goat', 'human'], dtype='object', name='Animals')) + FrozenList([['mammal'], ['cat', 'dog', 'goat', 'human']]) """ # Use cache_readonly to ensure that self.get_locs doesn't repeatedly # create new IndexEngine @@ -897,7 +894,7 @@ def levels(self) -> tuple[Index, ...]: for level in result: # disallow midx.levels[0].name = "foo" level._no_setting_name = True - return tuple(result) + return FrozenList(result) def _set_levels( self, @@ -920,14 +917,16 @@ def _set_levels( raise ValueError("Length of levels must match length of level.") if level is None: - new_levels = tuple(ensure_index(lev, copy=copy)._view() for lev in levels) + new_levels = FrozenList( + ensure_index(lev, copy=copy)._view() for lev in levels + ) level_numbers: range | list[int] = range(len(new_levels)) else: level_numbers = [self._get_level_number(lev) for lev in level] new_levels_list = list(self._levels) for lev_num, lev in zip(level_numbers, levels): new_levels_list[lev_num] = ensure_index(lev, copy=copy)._view() - new_levels = tuple(new_levels_list) + new_levels = FrozenList(new_levels_list) if verify_integrity: new_codes = self._verify_integrity( @@ -936,7 +935,7 @@ def _set_levels( self._codes = new_codes names = self.names - self._levels: tuple[Index, ...] = new_levels + self._levels = new_levels if any(names): self._set_names(names) @@ -981,7 +980,7 @@ def set_levels( (2, 'two'), (3, 'one'), (3, 'two')], - names=('foo', 'bar')) + names=['foo', 'bar']) >>> idx.set_levels([["a", "b", "c"], [1, 2]]) MultiIndex([('a', 1), @@ -990,7 +989,7 @@ def set_levels( ('b', 2), ('c', 1), ('c', 2)], - names=('foo', 'bar')) + names=['foo', 'bar']) >>> idx.set_levels(["a", "b", "c"], level=0) MultiIndex([('a', 'one'), ('a', 'two'), @@ -998,7 +997,7 @@ def set_levels( ('b', 'two'), ('c', 'one'), ('c', 'two')], - names=('foo', 'bar')) + names=['foo', 'bar']) >>> idx.set_levels(["a", "b"], level="bar") MultiIndex([(1, 'a'), (1, 'b'), @@ -1006,7 +1005,7 @@ def set_levels( (2, 'b'), (3, 'a'), (3, 'b')], - names=('foo', 'bar')) + names=['foo', 'bar']) If any of the levels passed to ``set_levels()`` exceeds the existing length, all of the values from that argument will @@ -1020,10 +1019,10 @@ def set_levels( ('b', 2), ('c', 1), ('c', 2)], - names=('foo', 'bar')) + names=['foo', 'bar']) >>> idx.set_levels([["a", "b", "c"], [1, 2, 3, 4]], level=[0, 1]).levels - (Index(['a', 'b', 'c'], dtype='object', name='foo'), Index([1, 2, 3, 4], dtype='int64', name='bar')) - """ # noqa: E501 + FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) + """ if isinstance(levels, Index): pass @@ -1076,7 +1075,7 @@ def levshape(self) -> Shape: # Codes Methods @property - def codes(self) -> tuple: + def codes(self) -> FrozenList: """ Codes of the MultiIndex. 
@@ -1098,7 +1097,7 @@ def codes(self) -> tuple: >>> arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] >>> mi = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) >>> mi.codes - (array([0, 0, 1, 1], dtype=int8), array([1, 0, 1, 0], dtype=int8)) + FrozenList([[0, 0, 1, 1], [1, 0, 1, 0]]) """ return self._codes @@ -1119,7 +1118,7 @@ def _set_codes( level_numbers: list[int] | range if level is None: - new_codes = tuple( + new_codes = FrozenList( _coerce_indexer_frozen(level_codes, lev, copy=copy).view() for lev, level_codes in zip(self._levels, codes) ) @@ -1132,7 +1131,7 @@ def _set_codes( new_codes_list[lev_num] = _coerce_indexer_frozen( level_codes, lev, copy=copy ) - new_codes = tuple(new_codes_list) + new_codes = FrozenList(new_codes_list) if verify_integrity: new_codes = self._verify_integrity( @@ -1173,32 +1172,32 @@ def set_codes( (1, 'two'), (2, 'one'), (2, 'two')], - names=('foo', 'bar')) + names=['foo', 'bar']) >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]]) MultiIndex([(2, 'one'), (1, 'one'), (2, 'two'), (1, 'two')], - names=('foo', 'bar')) + names=['foo', 'bar']) >>> idx.set_codes([1, 0, 1, 0], level=0) MultiIndex([(2, 'one'), (1, 'two'), (2, 'one'), (1, 'two')], - names=('foo', 'bar')) + names=['foo', 'bar']) >>> idx.set_codes([0, 0, 1, 1], level="bar") MultiIndex([(1, 'one'), (1, 'one'), (2, 'two'), (2, 'two')], - names=('foo', 'bar')) + names=['foo', 'bar']) >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]], level=[0, 1]) MultiIndex([(2, 'one'), (1, 'one'), (2, 'two'), (1, 'two')], - names=('foo', 'bar')) + names=['foo', 'bar']) """ level, codes = _require_listlike(level, codes, "Codes") @@ -1451,7 +1450,6 @@ def _format_multi( if len(self) == 0: return [] - formatted: Iterable stringified_levels = [] for lev, level_codes in zip(self.levels, self.codes): na = _get_na_rep(lev.dtype) @@ -1476,9 +1474,7 @@ def _format_multi( stringified_levels.append(formatted) result_levels = [] - # Incompatible types in assignment (expression has type "Iterable[Any]", - # variable has type "Index") - for lev, lev_name in zip(stringified_levels, self.names): # type: ignore[assignment] + for lev, lev_name in zip(stringified_levels, self.names): level = [] if include_names: @@ -1510,8 +1506,8 @@ def _format_multi( # -------------------------------------------------------------------- # Names Methods - def _get_names(self) -> tuple[Hashable | None, ...]: - return self._names + def _get_names(self) -> FrozenList: + return FrozenList(self._names) def _set_names(self, names, *, level=None, validate: bool = True) -> None: """ @@ -1558,7 +1554,6 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None: level = [self._get_level_number(lev) for lev in level] # set the name - new_names = list(self._names) for lev, name in zip(level, names): if name is not None: # GH 20527 @@ -1567,8 +1562,7 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None: raise TypeError( f"{type(self).__name__}.name must be a hashable type" ) - new_names[lev] = name - self._names = tuple(new_names) + self._names[lev] = name # If .levels has been accessed, the names in our cache will be stale. 
self._reset_cache() @@ -1587,9 +1581,9 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None: >>> mi MultiIndex([(1, 3, 5), (2, 4, 6)], - names=('x', 'y', 'z')) + names=['x', 'y', 'z']) >>> mi.names - ('x', 'y', 'z') + FrozenList(['x', 'y', 'z']) """, ) @@ -1681,7 +1675,7 @@ def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: # (previously declared in base class "IndexOpsMixin") _duplicated = duplicated # type: ignore[misc] - def fillna(self, value=None, downcast=None): + def fillna(self, value, downcast=None): """ fillna is not implemented for MultiIndex """ @@ -2063,7 +2057,7 @@ def remove_unused_levels(self) -> MultiIndex: >>> mi2 = mi[2:].remove_unused_levels() >>> mi2.levels - (RangeIndex(start=1, stop=2, step=1), Index(['a', 'b'], dtype='object')) + FrozenList([[1], ['a', 'b']]) """ new_levels = [] new_codes = [] @@ -2337,13 +2331,13 @@ def drop( # type: ignore[override] (1, 'purple'), (2, 'green'), (2, 'purple')], - names=('number', 'color')) + names=['number', 'color']) >>> idx.drop([(1, "green"), (2, "purple")]) MultiIndex([(0, 'green'), (0, 'purple'), (1, 'purple'), (2, 'green')], - names=('number', 'color')) + names=['number', 'color']) We can also drop from a specific level. @@ -2351,12 +2345,12 @@ def drop( # type: ignore[override] MultiIndex([(0, 'purple'), (1, 'purple'), (2, 'purple')], - names=('number', 'color')) + names=['number', 'color']) >>> idx.drop([1, 2], level=0) MultiIndex([(0, 'green'), (0, 'purple')], - names=('number', 'color')) + names=['number', 'color']) """ if level is not None: return self._drop_from_level(codes, level, errors) @@ -2497,17 +2491,17 @@ def reorder_levels(self, order) -> MultiIndex: >>> mi MultiIndex([(1, 3), (2, 4)], - names=('x', 'y')) + names=['x', 'y']) >>> mi.reorder_levels(order=[1, 0]) MultiIndex([(3, 1), (4, 2)], - names=('y', 'x')) + names=['y', 'x']) >>> mi.reorder_levels(order=["y", "x"]) MultiIndex([(3, 1), (4, 2)], - names=('y', 'x')) + names=['y', 'x']) """ order = [self._get_level_number(i) for i in order] result = self._reorder_ilevels(order) @@ -2803,7 +2797,6 @@ def get_slice_bound( label = (label,) return self._partial_tup_index(label, side=side) - # pylint: disable-next=useless-parent-delegation def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: """ For an ordered MultiIndex, compute the slice locations for input @@ -2877,9 +2870,7 @@ def _partial_tup_index(self, tup: tuple, side: Literal["left", "right"] = "left" if lab not in lev and not isna(lab): # short circuit try: - # Argument 1 to "searchsorted" has incompatible type "Index"; - # expected "ExtensionArray | ndarray[Any, Any]" - loc = algos.searchsorted(lev, lab, side=side) # type: ignore[arg-type] + loc = algos.searchsorted(lev, lab, side=side) except TypeError as err: # non-comparable e.g. 
test_slice_locs_with_type_mismatch raise TypeError(f"Level type mismatch: {lab}") from err @@ -3547,7 +3538,7 @@ def _reorder_indexer( k_codes = self.levels[i].get_indexer(k) k_codes = k_codes[k_codes >= 0] # Filter absent keys # True if the given codes are not ordered - need_sort = bool((k_codes[:-1] > k_codes[1:]).any()) + need_sort = (k_codes[:-1] > k_codes[1:]).any() else: need_sort = True elif isinstance(k, slice): @@ -3980,7 +3971,7 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]: __invert__ = make_invalid_op("__invert__") -def _lexsort_depth(codes: tuple[np.ndarray], nlevels: int) -> int: +def _lexsort_depth(codes: list[np.ndarray], nlevels: int) -> int: """Count depth (up to a maximum of `nlevels`) with which codes are lexsorted.""" int64_codes = [ensure_int64(level_codes) for level_codes in codes] for k in range(nlevels, 0, -1): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 982e305b7e471..c9b502add21e0 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -899,7 +899,7 @@ def __setitem__(self, key, value) -> None: check_dict_or_set_indexers(key) if isinstance(key, tuple): - key = tuple(list(x) if is_iterator(x) else x for x in key) + key = (list(x) if is_iterator(x) else x for x in key) key = tuple(com.apply_if_callable(x, self.obj) for x in key) else: maybe_callable = com.apply_if_callable(key, self.obj) @@ -1177,7 +1177,7 @@ def _check_deprecated_callable_usage(self, key: Any, maybe_callable: T) -> T: def __getitem__(self, key): check_dict_or_set_indexers(key) if type(key) is tuple: - key = tuple(list(x) if is_iterator(x) else x for x in key) + key = (list(x) if is_iterator(x) else x for x in key) key = tuple(com.apply_if_callable(x, self.obj) for x in key) if self._is_scalar_access(key): return self.obj._get_value(*key, takeable=self._takeable) @@ -1191,13 +1191,13 @@ def __getitem__(self, key): return self._getitem_axis(maybe_callable, axis=axis) def _is_scalar_access(self, key: tuple): - raise NotImplementedError() + raise NotImplementedError def _getitem_tuple(self, tup: tuple): raise AbstractMethodError(self) def _getitem_axis(self, key, axis: AxisInt): - raise NotImplementedError() + raise NotImplementedError def _has_valid_setitem_indexer(self, indexer) -> bool: raise AbstractMethodError(self) diff --git a/pandas/core/interchange/buffer.py b/pandas/core/interchange/buffer.py index 5d24325e67f62..62bf396256f2a 100644 --- a/pandas/core/interchange/buffer.py +++ b/pandas/core/interchange/buffer.py @@ -114,7 +114,7 @@ def __dlpack__(self) -> Any: """ Represent this structure as DLPack interface. 
""" - raise NotImplementedError() + raise NotImplementedError def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: """ diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 31234fb1f116f..89c8a4a27ca31 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -6,9 +6,9 @@ ) __all__ = [ - "Block", # pylint: disable=undefined-all-variable - "DatetimeTZBlock", # pylint: disable=undefined-all-variable - "ExtensionBlock", # pylint: disable=undefined-all-variable + "Block", + "DatetimeTZBlock", + "ExtensionBlock", "make_block", "BlockManager", "SingleBlockManager", diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index d6e1e8b38dfe3..ef25d7ed5ae9e 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -18,10 +18,14 @@ from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, + ExtensionDtype, PeriodDtype, ) -from pandas.core.arrays import DatetimeArray +from pandas.core.arrays import ( + DatetimeArray, + TimedeltaArray, +) from pandas.core.construction import extract_array from pandas.core.internals.blocks import ( check_ndim, @@ -32,11 +36,43 @@ ) if TYPE_CHECKING: - from pandas._typing import Dtype + from pandas._typing import ( + ArrayLike, + Dtype, + ) from pandas.core.internals.blocks import Block +def _make_block(values: ArrayLike, placement: np.ndarray) -> Block: + """ + This is an analogue to blocks.new_block(_2d) that ensures: + 1) correct dimension for EAs that support 2D (`ensure_block_shape`), and + 2) correct EA class for datetime64/timedelta64 (`maybe_coerce_values`). + + The input `values` is assumed to be either numpy array or ExtensionArray: + - In case of a numpy array, it is assumed to already be in the expected + shape for Blocks (2D, (cols, rows)). + - In case of an ExtensionArray the input can be 1D, also for EAs that are + internally stored as 2D. + + For the rest no preprocessing or validation is done, except for those dtypes + that are internally stored as EAs but have an exact numpy equivalent (and at + the moment use that numpy dtype), i.e. datetime64/timedelta64. 
+ """ + dtype = values.dtype + klass = get_block_type(dtype) + placement_obj = BlockPlacement(placement) + + if (isinstance(dtype, ExtensionDtype) and dtype._supports_2d) or isinstance( + values, (DatetimeArray, TimedeltaArray) + ): + values = ensure_block_shape(values, ndim=2) + + values = maybe_coerce_values(values) + return klass(values, ndim=2, placement=placement_obj) + + def make_block( values, placement, klass=None, ndim=None, dtype: Dtype | None = None ) -> Block: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 468ec32ce7760..7be1d5d95ffdf 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1388,12 +1388,10 @@ def interpolate( # If there are no NAs, then interpolate is a no-op return [self.copy(deep=False)] - # TODO(3.0): this case will not be reachable once GH#53638 is enforced if self.dtype == _dtype_obj: - # only deal with floats - # bc we already checked that can_hold_na, we don't have int dtype here - # test_interp_basic checks that we make a copy here - return [self.copy(deep=False)] + # GH#53631 + name = {1: "Series", 2: "DataFrame"}[self.ndim] + raise TypeError(f"{name} cannot interpolate with object dtype.") copy, refs = self._get_refs_and_copy(inplace) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 73b93110c9018..cea52bf8c91b2 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -567,7 +567,7 @@ def _extract_index(data) -> Index: if len(data) == 0: return default_index(0) - raw_lengths = [] + raw_lengths = set() indexes: list[list[Hashable] | Index] = [] have_raw_arrays = False @@ -583,7 +583,7 @@ def _extract_index(data) -> Index: indexes.append(list(val.keys())) elif is_list_like(val) and getattr(val, "ndim", 1) == 1: have_raw_arrays = True - raw_lengths.append(len(val)) + raw_lengths.add(len(val)) elif isinstance(val, np.ndarray) and val.ndim > 1: raise ValueError("Per-column arrays must each be 1-dimensional") @@ -596,24 +596,23 @@ def _extract_index(data) -> Index: index = union_indexes(indexes, sort=False) if have_raw_arrays: - lengths = list(set(raw_lengths)) - if len(lengths) > 1: + if len(raw_lengths) > 1: raise ValueError("All arrays must be of the same length") if have_dicts: raise ValueError( "Mixing dicts with non-Series may lead to ambiguous ordering." 
) - + raw_length = raw_lengths.pop() if have_series: - if lengths[0] != len(index): + if raw_length != len(index): msg = ( - f"array length {lengths[0]} does not match index " + f"array length {raw_length} does not match index " f"length {len(index)}" ) raise ValueError(msg) else: - index = default_index(lengths[0]) + index = default_index(raw_length) return ensure_index(index) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index a124e8679ae8e..22092551ec882 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -520,7 +520,7 @@ def nanany( if values.dtype.kind == "M": # GH#34479 - raise TypeError("datetime64 type does not support operation: 'any'") + raise TypeError("datetime64 type does not support operation 'any'") values, _ = _get_values(values, skipna, fill_value=False, mask=mask) @@ -576,7 +576,7 @@ def nanall( if values.dtype.kind == "M": # GH#34479 - raise TypeError("datetime64 type does not support operation: 'all'") + raise TypeError("datetime64 type does not support operation 'all'") values, _ = _get_values(values, skipna, fill_value=True, mask=mask) @@ -745,6 +745,10 @@ def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask= >>> s = pd.Series([1, np.nan, 2, 2]) >>> nanops.nanmedian(s.values) 2.0 + + >>> s = pd.Series([np.nan, np.nan, np.nan]) + >>> nanops.nanmedian(s.values) + nan """ # for floats without mask, the data already uses NaN as missing value # indicator, and `mask` will be calculated from that below -> in those @@ -763,6 +767,7 @@ def get_median(x, _mask=None): warnings.filterwarnings( "ignore", "All-NaN slice encountered", RuntimeWarning ) + warnings.filterwarnings("ignore", "Mean of empty slice", RuntimeWarning) res = np.nanmedian(x[_mask]) return res @@ -1428,20 +1433,15 @@ def _maybe_arg_null_out( return result if axis is None or not getattr(result, "ndim", False): - if skipna: - if mask.all(): - raise ValueError("Encountered all NA values") - else: - if mask.any(): - raise ValueError("Encountered an NA value with skipna=False") + if skipna and mask.all(): + raise ValueError("Encountered all NA values") + elif not skipna and mask.any(): + raise ValueError("Encountered an NA value with skipna=False") else: - na_mask = mask.all(axis) - if na_mask.any(): + if skipna and mask.all(axis).any(): raise ValueError("Encountered all NA values") - elif not skipna: - na_mask = mask.any(axis) - if na_mask.any(): - raise ValueError("Encountered an NA value with skipna=False") + elif not skipna and mask.any(axis).any(): + raise ValueError("Encountered an NA value with skipna=False") return result diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 810e30d369729..983a3df57e369 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -12,7 +12,6 @@ TYPE_CHECKING, Any, ) -import warnings import numpy as np @@ -29,7 +28,6 @@ is_supported_dtype, is_unitless, ) -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, @@ -424,15 +422,13 @@ def fill_bool(x, left=None): right = lib.item_from_zerodim(right) if is_list_like(right) and not hasattr(right, "dtype"): # e.g. list, tuple - warnings.warn( + raise TypeError( + # GH#52264 "Logical ops (and, or, xor) between Pandas objects and dtype-less " - "sequences (e.g. list, tuple) are deprecated and will raise in a " - "future version. Wrap the object in a Series, Index, or np.array " + "sequences (e.g. list, tuple) are no longer supported. 
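[Editor's aside: two small, hedged illustrations of the nanops hunks above; the error wording follows the new messages and the values are illustrative.]

import numpy as np
import pandas as pd

pd.Series([np.nan, np.nan]).median()   # nan; the "Mean of empty slice" warning is now silenced
s = pd.Series(pd.to_datetime(["2024-01-01"]))
# s.any()   # TypeError: datetime64 type does not support operation 'any'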
" + "Wrap the object in a Series, Index, or np.array " "before operating instead.", - FutureWarning, - stacklevel=find_stack_level(), ) - right = construct_1d_object_array_from_listlike(right) # NB: We assume extract_array has already been called on left and right lvalues = ensure_wrapped_if_datetimelike(left) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 43077e7aeecb4..4392f54d9c442 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1681,9 +1681,8 @@ def _wrap_result(self, result): if self.kind == "period" and not isinstance(result.index, PeriodIndex): if isinstance(result.index, MultiIndex): # GH 24103 - e.g. groupby resample - new_level = result.index.levels[-1] - if not isinstance(new_level, PeriodIndex): - new_level = new_level.to_period(self.freq) # type: ignore[attr-defined] + if not isinstance(result.index.levels[-1], PeriodIndex): + new_level = result.index.levels[-1].to_period(self.freq) result.index = result.index.set_levels(new_level, level=-1) else: result.index = result.index.to_period(self.freq) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 0868f711093d6..d17e5b475ae57 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -518,8 +518,11 @@ def _sanitize_mixed_ndim( # to have unique names name = current_column current_column += 1 - - obj = sample._constructor({name: obj}, copy=False) + obj = sample._constructor(obj, copy=False) + if isinstance(obj, ABCDataFrame): + obj.columns = range(name, name + 1, 1) + else: + obj = sample._constructor({name: obj}, copy=False) new_objs.append(obj) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index f51a833e5f906..b4720306094e9 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -237,7 +237,7 @@ def melt( else: mdata[col] = np.tile(id_data._values, num_cols_adjusted) - mcolumns = id_vars + list(var_name) + [value_name] + mcolumns = id_vars + var_name + [value_name] if frame.shape[1] > 0 and not any( not isinstance(dt, np.dtype) and dt._supports_2d for dt in frame.dtypes diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index dcb638cfee97b..e6e84c2135b82 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -106,6 +106,7 @@ from pandas import DataFrame from pandas.core import groupby from pandas.core.arrays import DatetimeArray + from pandas.core.indexes.frozen import FrozenList _factorizers = { np.int64: libhashtable.Int64Factorizer, @@ -1803,7 +1804,7 @@ def restore_dropped_levels_multijoin( join_index: Index, lindexer: npt.NDArray[np.intp], rindexer: npt.NDArray[np.intp], -) -> tuple[tuple, tuple, tuple]: +) -> tuple[FrozenList, FrozenList, FrozenList]: """ *this is an internal non-public method* @@ -1835,7 +1836,7 @@ def restore_dropped_levels_multijoin( levels of combined multiindexes labels : np.ndarray[np.intp] labels of combined multiindexes - names : tuple[Hashable] + names : List[Hashable] names of combined multiindex levels """ @@ -1877,11 +1878,12 @@ def _convert_to_multiindex(index: Index) -> MultiIndex: else: restore_codes = algos.take_nd(codes, indexer, fill_value=-1) - join_levels = join_levels + (restore_levels,) - join_codes = join_codes + (restore_codes,) - join_names = join_names + (dropped_level_name,) + # error: Cannot determine type of "__add__" + join_levels = join_levels + [restore_levels] # type: ignore[has-type] + join_codes = join_codes + [restore_codes] # type: ignore[has-type] + join_names = join_names + 
[dropped_level_name] - return tuple(join_levels), tuple(join_codes), tuple(join_names) + return join_levels, join_codes, join_names class _OrderedMerge(_MergeOperation): @@ -2063,8 +2065,8 @@ def _validate_left_right_on(self, left_on, right_on): or is_string_dtype(ro_dtype) ): raise MergeError( - f"Incompatible merge dtype, {ro_dtype!r} and " - f"{lo_dtype!r}, both sides must have numeric dtype" + f"Incompatible merge dtype, {lo_dtype!r} and " + f"{ro_dtype!r}, both sides must have numeric dtype" ) # add 'by' to our key-list so we can have it in the diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index b62f550662f5d..e0126d439a79c 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -397,7 +397,11 @@ def _all_key(key): if isinstance(piece.index, MultiIndex): # We are adding an empty level transformed_piece.index = MultiIndex.from_tuples( - [all_key], names=piece.index.names + (None,) + [all_key], + names=piece.index.names + + [ + None, + ], ) else: transformed_piece.index = Index([all_key], name=piece.index.name) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 0a2f7fe43b4b3..01cc85ceff181 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -35,6 +35,7 @@ factorize, unique, ) +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.categorical import factorize_from_iterable from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.frame import DataFrame @@ -61,6 +62,7 @@ ) from pandas.core.arrays import ExtensionArray + from pandas.core.indexes.frozen import FrozenList class _Unstacker: @@ -231,20 +233,31 @@ def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]: return new_values, mask.any(0) # TODO: in all tests we have mask.any(0).all(); can we rely on that? - def get_result(self, values, value_columns, fill_value) -> DataFrame: + def get_result(self, obj, value_columns, fill_value) -> DataFrame: + values = obj._values if values.ndim == 1: values = values[:, np.newaxis] if value_columns is None and values.shape[1] != 1: # pragma: no cover raise ValueError("must pass column labels for multi-column data") - values, _ = self.get_new_values(values, fill_value) + new_values, _ = self.get_new_values(values, fill_value) columns = self.get_new_columns(value_columns) index = self.new_index - return self.constructor( - values, index=index, columns=columns, dtype=values.dtype + result = self.constructor( + new_values, index=index, columns=columns, dtype=new_values.dtype, copy=False ) + if isinstance(values, np.ndarray): + base, new_base = values.base, new_values.base + elif isinstance(values, NDArrayBackedExtensionArray): + base, new_base = values._ndarray.base, new_values._ndarray.base + else: + base, new_base = 1, 2 # type: ignore[assignment] + if base is new_base: + # We can only get here if one of the dimensions is size 1 + result._mgr.add_references(obj._mgr) + return result def get_new_values(self, values, fill_value=None): if values.ndim == 1: @@ -337,15 +350,21 @@ def get_new_columns(self, value_columns: Index | None): width = len(value_columns) propagator = np.repeat(np.arange(width), stride) - new_levels: tuple[Index, ...] 
+ new_levels: FrozenList | list[Index] if isinstance(value_columns, MultiIndex): - new_levels = value_columns.levels + (self.removed_level_full,) + # error: Cannot determine type of "__add__" [has-type] + new_levels = value_columns.levels + ( # type: ignore[has-type] + self.removed_level_full, + ) new_names = value_columns.names + (self.removed_name,) new_codes = [lab.take(propagator) for lab in value_columns.codes] else: - new_levels = (value_columns, self.removed_level_full) + new_levels = [ + value_columns, + self.removed_level_full, + ] new_names = [value_columns.name, self.removed_name] new_codes = [propagator] @@ -532,9 +551,7 @@ def unstack( unstacker = _Unstacker( obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort ) - return unstacker.get_result( - obj._values, value_columns=None, fill_value=fill_value - ) + return unstacker.get_result(obj, value_columns=None, fill_value=fill_value) def _unstack_frame( @@ -550,7 +567,7 @@ def _unstack_frame( return obj._constructor_from_mgr(mgr, axes=mgr.axes) else: return unstacker.get_result( - obj._values, value_columns=obj.columns, fill_value=fill_value + obj, value_columns=obj.columns, fill_value=fill_value ) @@ -977,26 +994,27 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: # Construct the correct MultiIndex by combining the frame's index and # stacked columns. + index_levels: list | FrozenList if isinstance(frame.index, MultiIndex): index_levels = frame.index.levels - index_codes = tuple(np.tile(frame.index.codes, (1, ratio))) + index_codes = list(np.tile(frame.index.codes, (1, ratio))) else: codes, uniques = factorize(frame.index, use_na_sentinel=False) - # Incompatible types in assignment (expression has type - # "tuple[ndarray[Any, Any] | Index]", variable has type "tuple[Index, ...]") - index_levels = (uniques,) # type: ignore[assignment] - index_codes = tuple(np.tile(codes, (1, ratio))) + index_levels = [uniques] + index_codes = list(np.tile(codes, (1, ratio))) if isinstance(ordered_stack_cols, MultiIndex): column_levels = ordered_stack_cols.levels column_codes = ordered_stack_cols.drop_duplicates().codes else: - column_levels = (ordered_stack_cols.unique(),) - column_codes = (factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0],) - column_codes = tuple(np.repeat(codes, len(frame)) for codes in column_codes) + column_levels = [ordered_stack_cols.unique()] + column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] + # error: Incompatible types in assignment (expression has type "list[ndarray[Any, + # dtype[Any]]]", variable has type "FrozenList") + column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] # type: ignore[assignment] result.index = MultiIndex( levels=index_levels + column_levels, codes=index_codes + column_codes, - names=frame.index.names + ordered_stack_cols.names, + names=frame.index.names + list(ordered_stack_cols.names), verify_integrity=False, ) diff --git a/pandas/core/series.py b/pandas/core/series.py index b0dc05fce7913..8b36bd0f381c5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -97,7 +97,6 @@ algorithms, base, common as com, - missing, nanops, ops, roperator, @@ -133,6 +132,7 @@ PeriodIndex, default_index, ensure_index, + maybe_sequence_to_range, ) import pandas.core.indexes.base as ibase from pandas.core.indexes.multi import maybe_droplevels @@ -539,8 +539,6 @@ def _init_dict( _data : BlockManager for the new Series index : index for the new Series """ - keys: Index | tuple - # Looking for NaN in 
dict doesn't work ({np.nan : 1}[float('nan')] # raises KeyError), so we iterate the entire dict, and align if data: @@ -548,7 +546,7 @@ def _init_dict( # using generators in effects the performance. # Below is the new way of extracting the keys and values - keys = tuple(data.keys()) + keys = maybe_sequence_to_range(tuple(data.keys())) values = list(data.values()) # Generating list of values- faster way elif index is not None: # fastpath for Series(data=None). Just use broadcasting a scalar @@ -636,6 +634,10 @@ def dtypes(self) -> DtypeObj: """ Return the dtype object of the underlying data. + See Also + -------- + DataFrame.dtypes : Return the dtypes in the DataFrame. + Examples -------- >>> s = pd.Series([1, 2, 3]) @@ -1471,7 +1473,6 @@ def __repr__(self) -> str: """ Return a string representation for a particular Series. """ - # pylint: disable=invalid-repr-returned repr_params = fmt.get_series_repr_params() return self.to_string(**repr_params) @@ -2061,7 +2062,7 @@ def mode(self, dropna: bool = True) -> Series: dtype=self.dtype, ).__finalize__(self, method="mode") - def unique(self) -> ArrayLike: # pylint: disable=useless-parent-delegation + def unique(self) -> ArrayLike: """ Return unique values of Series object. @@ -4257,6 +4258,10 @@ def unstack( DataFrame Unstacked Series. + See Also + -------- + DataFrame.unstack : Pivot the MultiIndex of a DataFrame. + Notes ----- Reference :ref:`the user guide ` for more examples. @@ -5112,40 +5117,6 @@ def info( show_counts=show_counts, ) - @overload - def _replace_single(self, to_replace, inplace: Literal[False]) -> Self: ... - - @overload - def _replace_single(self, to_replace, inplace: Literal[True]) -> None: ... - - @overload - def _replace_single(self, to_replace, inplace: bool) -> Self | None: ... - - # TODO(3.0): this can be removed once GH#33302 deprecation is enforced - def _replace_single(self, to_replace, inplace: bool) -> Self | None: - """ - Replaces values in a Series using the fill method specified when no - replacement value is given in the replace method - """ - limit = None - method = "pad" - - result = self if inplace else self.copy() - - values = result._values - mask = missing.mask_missing(values, to_replace) - - if isinstance(values, ExtensionArray): - # dispatch to the EA's _pad_mask_inplace method - values._fill_mask_inplace(method, limit, mask) - else: - fill_f = missing.get_fill_func(method) - fill_f(values, limit=limit, mask=mask) - - if inplace: - return None - return result - def memory_usage(self, index: bool = True, deep: bool = False) -> int: """ Return the memory usage of the Series. @@ -5644,6 +5615,8 @@ def to_timestamp( """ Cast to DatetimeIndex of Timestamps, at *beginning* of period. + This can be changed to the *end* of the period, by specifying `how="e"`. + Parameters ---------- freq : str, default frequency of PeriodIndex @@ -5671,6 +5644,12 @@ def to_timestamp( Returns ------- Series with DatetimeIndex + Series with the PeriodIndex cast to DatetimeIndex. + + See Also + -------- + Series.to_period: Inverse method to cast DatetimeIndex to PeriodIndex. + DataFrame.to_timestamp: Equivalent method for DataFrame. Examples -------- @@ -5744,6 +5723,11 @@ def to_period( Series Series with index converted to PeriodIndex. + See Also + -------- + DataFrame.to_period: Equivalent method for DataFrame. + Series.dt.to_period: Convert DateTime column values. 
+ Examples -------- >>> idx = pd.DatetimeIndex(["2023", "2024", "2025"]) @@ -5879,17 +5863,12 @@ def _align_for_op(self, right, align_asobject: bool = False): object, np.bool_, ): - warnings.warn( - "Operation between non boolean Series with different " - "indexes will no longer return a boolean result in " - "a future version. Cast both Series to object type " - "to maintain the prior behavior.", - FutureWarning, - stacklevel=find_stack_level(), - ) - # to keep original value's dtype for bool ops - left = left.astype(object) - right = right.astype(object) + pass + # GH#52538 no longer cast in these cases + else: + # to keep original value's dtype for bool ops + left = left.astype(object) + right = right.astype(object) left, right = left.align(right) @@ -6207,6 +6186,7 @@ def any( # type: ignore[override] filter_type="bool", ) + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="all") @Appender(make_doc("all", ndim=1)) def all( self, @@ -6226,6 +6206,7 @@ def all( filter_type="bool", ) + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="min") @doc(make_doc("min", ndim=1)) def min( self, @@ -6238,6 +6219,7 @@ def min( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="max") @doc(make_doc("max", ndim=1)) def max( self, @@ -6250,6 +6232,7 @@ def max( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sum") @doc(make_doc("sum", ndim=1)) def sum( self, @@ -6268,6 +6251,7 @@ def sum( **kwargs, ) + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="prod") @doc(make_doc("prod", ndim=1)) def prod( self, @@ -6286,6 +6270,7 @@ def prod( **kwargs, ) + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="mean") @doc(make_doc("mean", ndim=1)) def mean( self, @@ -6298,6 +6283,7 @@ def mean( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="median") @doc(make_doc("median", ndim=1)) def median( self, @@ -6310,6 +6296,7 @@ def median( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sem") @doc(make_doc("sem", ndim=1)) def sem( self, @@ -6328,6 +6315,7 @@ def sem( **kwargs, ) + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="var") @doc(make_doc("var", ndim=1)) def var( self, @@ -6346,6 +6334,7 @@ def var( **kwargs, ) + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="std") @doc(make_doc("std", ndim=1)) def std( self, @@ -6364,6 +6353,7 @@ def std( **kwargs, ) + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="skew") @doc(make_doc("skew", ndim=1)) def skew( self, @@ -6376,6 +6366,7 @@ def skew( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) + @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="kurt") @doc(make_doc("kurt", ndim=1)) def kurt( self, diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 2d8517693a2f8..38a443b56ee3d 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -608,24 +608,7 @@ 4 None dtype: object - When ``value`` is not explicitly passed and `to_replace` is a scalar, list - or tuple, `replace` uses the method parameter (default 'pad') to 
do the - replacement. So this is why the 'a' values are being replaced by 10 - in rows 1 and 2 and 'b' in row 4 in this case. - - >>> s.replace('a') - 0 10 - 1 10 - 2 10 - 3 b - 4 b - dtype: object - - .. deprecated:: 2.1.0 - The 'method' parameter and padding behavior are deprecated. - - On the other hand, if ``None`` is explicitly passed for ``value``, it will - be respected: + If ``None`` is explicitly passed for ``value``, it will be respected: >>> s.replace('a', None) 0 10 diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 493e856c6dcc6..4fba243f73536 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -172,8 +172,6 @@ def maybe_lift(lab, size: int) -> tuple[np.ndarray, int]: for i, (lab, size) in enumerate(zip(labels, shape)): labels[i], lshape[i] = maybe_lift(lab, size) - labels = list(labels) - # Iteratively process all the labels in chunks sized so less # than lib.i8max unique int ids will be required for each chunk while True: @@ -577,7 +575,7 @@ def ensure_key_mapped( if isinstance( values, Index ): # convert to a new Index subclass, not necessarily the same - result = Index(result) + result = Index(result, tupleize_cols=False) else: # try to revert to original type otherwise type_of_values = type(values) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index ef115e350462f..d274c1d7a5aff 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3608,7 +3608,7 @@ def str_extractall(arr, pat, flags: int = 0) -> DataFrame: from pandas import MultiIndex - index = MultiIndex.from_tuples(index_list, names=arr.index.names + ("match",)) + index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) dtype = _result_dtype(arr) result = arr._constructor_expanddim( diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 2aeb1aff07a54..df7a6cdb1ea52 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1124,18 +1124,18 @@ def f(value): # we require at least Ymd required = ["year", "month", "day"] - req = sorted(set(required) - set(unit_rev.keys())) + req = set(required) - set(unit_rev.keys()) if len(req): - _required = ",".join(req) + _required = ",".join(sorted(req)) raise ValueError( "to assemble mappings requires at least that " f"[year, month, day] be specified: [{_required}] is missing" ) # keys we don't recognize - excess = sorted(set(unit_rev.keys()) - set(_unit_map.values())) + excess = set(unit_rev.keys()) - set(_unit_map.values()) if len(excess): - _excess = ",".join(excess) + _excess = ",".join(sorted(excess)) raise ValueError( f"extra keys have been passed to the datetime assemblage: [{_excess}]" ) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 07998cdbd40b5..db6078ae636e3 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -820,12 +820,12 @@ def _apply_pairwise( else: idx_codes, idx_levels = factorize(result.index) result_codes = [idx_codes] - result_levels = [idx_levels] # type: ignore[list-item] + result_levels = [idx_levels] result_names = [result.index.name] - # 3) Create the resulting index by combining 1) + 2) + # 3) Create the resulting index by combining 1) + 2) result_codes = groupby_codes + result_codes - result_levels = groupby_levels + result_levels # type: ignore[assignment] + result_levels = groupby_levels + result_levels result_names = self._grouper.names + result_names result_index = MultiIndex( diff --git a/pandas/io/common.py 
b/pandas/io/common.py index 35c3a24d8e8f6..4507a7d08c8ba 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -361,6 +361,16 @@ def _get_filepath_or_buffer( stacklevel=find_stack_level(), ) + if "a" in mode and compression_method in ["zip", "tar"]: + # GH56778 + warnings.warn( + "zip and tar do not support mode 'a' properly. " + "This combination will result in multiple files with same name " + "being added to the archive.", + RuntimeWarning, + stacklevel=find_stack_level(), + ) + # Use binary mode when converting path-like objects to file-like objects (fsspec) # except when text mode is explicitly requested. The original mode is returned if # fsspec is not used. diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index a9da95054b81a..2b35cfa044ae9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -979,6 +979,12 @@ class ExcelWriter(Generic[_WorkbookT]): .. versionadded:: 1.3.0 + See Also + -------- + read_excel : Read an Excel sheet values (xlsx) file into DataFrame. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + Notes ----- For compatibility with CSV writers, ExcelWriter serializes lists @@ -1434,6 +1440,7 @@ def inspect_excel_format( return "zip" +@doc(storage_options=_shared_docs["storage_options"]) class ExcelFile: """ Class for parsing tabular Excel sheets into DataFrame objects. @@ -1472,19 +1479,27 @@ class ExcelFile: - Otherwise if ``path_or_buffer`` is in xlsb format, `pyxlsb `_ will be used. - .. versionadded:: 1.3.0 + .. versionadded:: 1.3.0 - Otherwise if `openpyxl `_ is installed, then ``openpyxl`` will be used. - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. - .. warning:: + .. warning:: - Please do not report issues when using ``xlrd`` to read ``.xlsx`` files. - This is not supported, switch to using ``openpyxl`` instead. + Please do not report issues when using ``xlrd`` to read ``.xlsx`` files. + This is not supported, switch to using ``openpyxl`` instead. + {storage_options} engine_kwargs : dict, optional Arbitrary keyword arguments passed to excel engine. + See Also + -------- + DataFrame.to_excel : Write DataFrame to an Excel file. + DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + Examples -------- >>> file = pd.ExcelFile("myfile.xlsx") # doctest: +SKIP @@ -1595,11 +1610,134 @@ def parse( Equivalent to read_excel(ExcelFile, ...) See the read_excel docstring for more info on accepted parameters. + Parameters + ---------- + sheet_name : str, int, list, or None, default 0 + Strings are used for sheet names. Integers are used in zero-indexed + sheet positions (chart sheets do not count as a sheet position). + Lists of strings/integers are used to request multiple sheets. + Specify ``None`` to get all worksheets. + header : int, list of int, default 0 + Row (0-indexed) to use for the column labels of the parsed + DataFrame. If a list of integers is passed those row positions will + be combined into a ``MultiIndex``. Use None if there is no header. + names : array-like, default None + List of column names to use. If file contains no header row, + then you should explicitly pass header=None. + index_col : int, str, list of int, default None + Column (0-indexed) to use as the row labels of the DataFrame. 
+ Pass None if there is no such column. If a list is passed, + those columns will be combined into a ``MultiIndex``. If a + subset of data is selected with ``usecols``, index_col + is based on the subset. + + Missing values will be forward filled to allow roundtripping with + ``to_excel`` for ``merged_cells=True``. To avoid forward filling the + missing values use ``set_index`` after reading the data instead of + ``index_col``. + usecols : str, list-like, or callable, default None + * If None, then parse all columns. + * If str, then indicates comma separated list of Excel column letters + and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of + both sides. + * If list of int, then indicates list of column numbers to be parsed + (0-indexed). + * If list of string, then indicates list of column names to be parsed. + * If callable, then evaluate each column name against it and parse the + column if the callable returns ``True``. + + Returns a subset of the columns according to behavior above. + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the Excel cell content, and return the transformed + content. + true_values : list, default None + Values to consider as True. + false_values : list, default None + Values to consider as False. + skiprows : list-like, int, or callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (int) at the + start of the file. If callable, the callable function will be evaluated + against the row indices, returning True if the row should be skipped and + False otherwise. An example of a valid callable argument would be ``lambda + x: x in [0, 2]``. + nrows : int, default None + Number of rows to parse. + na_values : scalar, str, list-like, or dict, default None + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values. + parse_dates : bool, list-like, or dict, default False + The behavior is as follows: + + * ``bool``. If True -> try parsing the index. + * ``list`` of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * ``list`` of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and + parse as a single date column. + * ``dict``, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call + result 'foo' + + If a column or index contains an unparsable date, the entire column or + index will be returned unaltered as an object data type. If you + don`t want to parse some cells as date just change their type + in Excel to "Text".For non-standard datetime parsing, use + ``pd.to_datetime`` after ``pd.read_excel``. + + Note: A fast-path exists for iso8601-formatted dates. + date_parser : function, optional + Function to use for converting a sequence of string columns to an array of + datetime instances. The default uses ``dateutil.parser.parser`` to do the + conversion. Pandas will try to call `date_parser` in three different ways, + advancing to the next if an exception occurs: 1) Pass one or more arrays + (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by `parse_dates` into a single array + and pass that; and 3) call `date_parser` once for each row using one or + more strings (corresponding to the columns defined by `parse_dates`) as + arguments. + + .. 
deprecated:: 2.0.0 + Use ``date_format`` instead, or read in as ``object`` and then apply + :func:`to_datetime` as-needed. + date_format : str or dict of column -> format, default ``None`` + If used in conjunction with ``parse_dates``, will parse dates + according to this format. For anything more complex, + please read in as ``object`` and then apply :func:`to_datetime` as-needed. + thousands : str, default None + Thousands separator for parsing string columns to numeric. Note that + this parameter is only necessary for columns stored as TEXT in Excel, + any numeric columns will automatically be parsed, regardless of display + format. + comment : str, default None + Comments out remainder of line. Pass a character or characters to this + argument to indicate comments in the input file. Any data between the + comment string and the end of the current line is ignored. + skipfooter : int, default 0 + Rows at the end to skip (0-indexed). + dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + **kwds : dict, optional + Arbitrary keyword arguments passed to excel engine. + Returns ------- DataFrame or dict of DataFrames DataFrame from the passed in Excel file. + See Also + -------- + read_excel : Read an Excel sheet values (xlsx) file into DataFrame. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + Examples -------- >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index ab5f1c039b7ca..b2b0d711c6b54 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -3038,7 +3038,7 @@ def set_properties(self, subset: Subset | None = None, **kwargs) -> Styler: return self.map(lambda x: values, subset=subset) @Substitution(subset=subset_args) - def bar( # pylint: disable=disallowed-name + def bar( self, subset: Subset | None = None, axis: Axis | None = 0, diff --git a/pandas/io/html.py b/pandas/io/html.py index b4f6a5508726b..42f5266e7649b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -584,14 +584,8 @@ class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): :class:`pandas.io.html._HtmlFrameParser`. """ - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - from bs4 import SoupStrainer - - self._strainer = SoupStrainer("table") - def _parse_tables(self, document, match, attrs): - element_name = self._strainer.name + element_name = "table" tables = document.find_all(element_name, attrs=attrs) if not tables: raise ValueError("No tables found") diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index ef717dd9b7ef8..7d3eefae39679 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -289,10 +289,10 @@ def json_normalize( meta : list of paths (str or list of str), default None Fields to use as metadata for each record in resulting table. meta_prefix : str, default None - If True, prefix records with dotted (?) path, e.g. foo.bar.field if + If True, prefix records with dotted path, e.g. foo.bar.field if meta is ['foo', 'bar']. 
record_prefix : str, default None - If True, prefix records with dotted (?) path, e.g. foo.bar.field if + If True, prefix records with dotted path, e.g. foo.bar.field if path to records is ['foo', 'bar']. errors : {'raise', 'ignore'}, default 'raise' Configures error handling. diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 5a7d117b0543e..510097aed2a25 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -487,6 +487,8 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: col_na_values, col_na_fvalues = _get_na_values( col_name, self.na_values, self.na_fvalues, self.keep_default_na ) + else: + col_na_values, col_na_fvalues = set(), set() clean_dtypes = self._clean_mapping(self.dtype) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 44210b6979827..e2456b165fe60 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -354,14 +354,15 @@ def _convert_data( if isinstance(self.na_values, dict): for col in self.na_values: - na_value = self.na_values[col] - na_fvalue = self.na_fvalues[col] + if col is not None: + na_value = self.na_values[col] + na_fvalue = self.na_fvalues[col] - if isinstance(col, int) and col not in self.orig_names: - col = self.orig_names[col] + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] - clean_na_values[col] = na_value - clean_na_fvalues[col] = na_fvalue + clean_na_values[col] = na_value + clean_na_fvalues[col] = na_fvalue else: clean_na_values = self.na_values clean_na_fvalues = self.na_fvalues diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 7ecd8cd6d5012..70f9a68244164 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -194,6 +194,12 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): parameter ignores commented lines and empty lines if ``skip_blank_lines=True``, so ``header=0`` denotes the first line of data rather than the first line of the file. + + When inferred from the file contents, headers are kept distinct from + each other by renaming duplicate names with a numeric suffix of the form + ``".{{count}}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. + Empty headers are named ``"Unnamed: {{i}}"`` or ``"Unnamed: {{i}}_level_{{level}}"`` + in the case of MultiIndex columns. names : Sequence of Hashable, optional Sequence of column labels to apply. If the file contains a header row, then you should explicitly pass ``header=0`` to override the column names. 
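The header note added above describes how inferred column names are de-duplicated and how empty header cells are filled in. A minimal sketch of that rule (the CSV contents here are invented; the printed names follow the suffixing behaviour described in the note and should be checked against the pandas version in use):

import io

import pandas as pd

# One duplicated header ("a") and two empty header cells.
buf = io.StringIO("a,a,,\n1,2,3,4\n")
df = pd.read_csv(buf)

# Duplicates gain a ".{count}" suffix starting from 1, and empty headers
# are named "Unnamed: {i}" from their column position.
print(list(df.columns))  # expected: ['a', 'a.1', 'Unnamed: 2', 'Unnamed: 3']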
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5ecf7e287ea58..3cfd740a51304 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -292,14 +292,14 @@ def to_hdf( dropna=dropna, ) - path_or_buf = stringify_path(path_or_buf) - if isinstance(path_or_buf, str): + if isinstance(path_or_buf, HDFStore): + f(path_or_buf) + else: + path_or_buf = stringify_path(path_or_buf) with HDFStore( path_or_buf, mode=mode, complevel=complevel, complib=complib ) as store: f(store) - else: - f(path_or_buf) def read_hdf( diff --git a/pandas/io/sql.py b/pandas/io/sql.py index aa9d0d88ae69a..8c4c4bac884e5 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -473,8 +473,9 @@ def read_sql_query( -------- >>> from sqlalchemy import create_engine # doctest: +SKIP >>> engine = create_engine("sqlite:///database.db") # doctest: +SKIP + >>> sql_query = "SELECT int_column FROM test_data" # doctest: +SKIP >>> with engine.connect() as conn, conn.begin(): # doctest: +SKIP - ... data = pd.read_sql_table("data", conn) # doctest: +SKIP + ... data = pd.read_sql_query(sql_query, conn) # doctest: +SKIP """ check_dtype_backend(dtype_backend) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 3ec077806d6c4..47d879c022ee6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1122,6 +1122,7 @@ def __init__( # State variables for the file self._close_file: Callable[[], None] | None = None self._column_selector_set = False + self._value_label_dict: dict[str, dict[int, str]] = {} self._value_labels_read = False self._dtype: np.dtype | None = None self._lines_read = 0 @@ -1376,7 +1377,7 @@ def _get_time_stamp(self) -> str: elif self._format_version > 104: return self._decode(self._path_or_buf.read(18)) else: - raise ValueError() + raise ValueError def _get_seek_variable_labels(self) -> int: if self._format_version == 117: @@ -1388,7 +1389,7 @@ def _get_seek_variable_labels(self) -> int: elif self._format_version >= 118: return self._read_int64() + 17 else: - raise ValueError() + raise ValueError def _read_old_header(self, first_char: bytes) -> None: self._format_version = int(first_char[0]) @@ -1502,17 +1503,8 @@ def _decode(self, s: bytes) -> str: ) return s.decode("latin-1") - def _read_value_labels(self) -> None: - self._ensure_open() - if self._value_labels_read: - # Don't read twice - return - if self._format_version <= 108: - # Value labels are not supported in version 108 and earlier. 
- self._value_labels_read = True - self._value_label_dict: dict[str, dict[float, str]] = {} - return - + def _read_new_value_labels(self) -> None: + """Reads value labels with variable length strings (108 and later format)""" if self._format_version >= 117: self._path_or_buf.seek(self._seek_value_labels) else: @@ -1520,9 +1512,6 @@ def _read_value_labels(self) -> None: offset = self._nobs * self._dtype.itemsize self._path_or_buf.seek(self._data_location + offset) - self._value_labels_read = True - self._value_label_dict = {} - while True: if self._format_version >= 117: if self._path_or_buf.read(5) == b" @@ -1530,8 +1519,10 @@ def _read_value_labels(self) -> None: slength = self._path_or_buf.read(4) if not slength: - break # end of value label table (format < 117) - if self._format_version <= 117: + break # end of value label table (format < 117), or end-of-file + if self._format_version == 108: + labname = self._decode(self._path_or_buf.read(9)) + elif self._format_version <= 117: labname = self._decode(self._path_or_buf.read(33)) else: labname = self._decode(self._path_or_buf.read(129)) @@ -1555,8 +1546,45 @@ def _read_value_labels(self) -> None: self._value_label_dict[labname][val[i]] = self._decode( txt[off[i] : end] ) + if self._format_version >= 117: self._path_or_buf.read(6) # + + def _read_old_value_labels(self) -> None: + """Reads value labels with fixed-length strings (105 and earlier format)""" + assert self._dtype is not None + offset = self._nobs * self._dtype.itemsize + self._path_or_buf.seek(self._data_location + offset) + + while True: + if not self._path_or_buf.read(2): + # end-of-file may have been reached, if so stop here + break + + # otherwise back up and read again, taking byteorder into account + self._path_or_buf.seek(-2, os.SEEK_CUR) + n = self._read_uint16() + labname = self._decode(self._path_or_buf.read(9)) + self._path_or_buf.read(1) # padding + codes = np.frombuffer( + self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n + ) + self._value_label_dict[labname] = {} + for i in range(n): + self._value_label_dict[labname][codes[i]] = self._decode( + self._path_or_buf.read(8) + ) + + def _read_value_labels(self) -> None: + self._ensure_open() + if self._value_labels_read: + # Don't read twice + return + + if self._format_version >= 108: + self._read_new_value_labels() + else: + self._read_old_value_labels() self._value_labels_read = True def _read_strls(self) -> None: @@ -1729,7 +1757,7 @@ def read( i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt) ) - if convert_categoricals and self._format_version > 108: + if convert_categoricals: data = self._do_convert_categoricals( data, self._value_label_dict, self._lbllist, order_categoricals ) @@ -1845,7 +1873,7 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra def _do_convert_categoricals( self, data: DataFrame, - value_label_dict: dict[str, dict[float, str]], + value_label_dict: dict[str, dict[int, str]], lbllist: Sequence[str], order_categoricals: bool, ) -> DataFrame: @@ -1983,7 +2011,7 @@ def variable_labels(self) -> dict[str, str]: self._ensure_open() return dict(zip(self._varlist, self._variable_labels)) - def value_labels(self) -> dict[str, dict[float, str]]: + def value_labels(self) -> dict[str, dict[int, str]]: """ Return a nested dict associating each variable name to its value and label. 
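The pandas/io/stata.py hunk above splits value-label parsing into _read_new_value_labels (format 108 and later, variable-length strings) and _read_old_value_labels (format 105 and earlier, fixed-length strings), and narrows the mapping type to dict[str, dict[int, str]]. A minimal usage sketch of where that mapping surfaces to callers (the file name is hypothetical):

from pandas.io.stata import StataReader

# Hypothetical Stata file containing labelled integer variables.
with StataReader("survey.dta") as reader:
    # convert_categoricals=True (the default) turns labelled columns into
    # Categoricals built from the value labels.
    df = reader.read(convert_categoricals=True)
    # The raw mapping itself: {variable_name: {integer_code: label}}.
    labels = reader.value_labels()

Typing the inner keys as int rather than float mirrors this hunk: the old-format reader decodes its codes from 2-byte integers, and the new-format reader likewise yields integer codes.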
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 763244c5bdf0e..60bb45d3ac1dc 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1187,7 +1187,7 @@ def line( ) @Substitution(kind="bar") @Appender(_bar_or_line_doc) - def bar( # pylint: disable=disallowed-name + def bar( self, x: Hashable | None = None, y: Hashable | None = None, diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index b41e03d87b275..2a28cd94b64e5 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -82,7 +82,7 @@ def __init__(self, data, return_type: str = "axes", **kwargs) -> None: self.return_type = return_type # Do not call LinePlot.__init__ which may fill nan - MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called + MPLPlot.__init__(self, data, **kwargs) if self.subplots: # Disable label ax sharing. Otherwise, all subplots shows last @@ -533,14 +533,14 @@ def boxplot_frame_groupby( ) axes = flatten_axes(axes) - ret = pd.Series(dtype=object) - + data = {} for (key, group), ax in zip(grouped, axes): d = group.boxplot( ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds ) ax.set_title(pprint_thing(key)) - ret.loc[key] = d + data[key] = d + ret = pd.Series(data) maybe_adjust_figure(fig, bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) else: keys, frames = zip(*grouped) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 700136bca8da7..38a75e741d60e 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -297,7 +297,7 @@ def __init__( def _validate_sharex(sharex: bool | None, ax, by) -> bool: if sharex is None: # if by is defined, subplots are used and sharex should be False - if ax is None and by is None: # pylint: disable=simplifiable-if-statement + if ax is None and by is None: sharex = True else: # if we get an axis, the users should do the visibility diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index d9d1df128d199..ca635386be335 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -78,7 +78,7 @@ def __init__( self.xlabel = kwargs.get("xlabel") self.ylabel = kwargs.get("ylabel") # Do not call LinePlot.__init__ which may fill nan - MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called + MPLPlot.__init__(self, data, **kwargs) self.bins = self._adjust_bins(bins) @@ -236,7 +236,7 @@ def __init__( self, data, bw_method=None, ind=None, *, weights=None, **kwargs ) -> None: # Do not call LinePlot.__init__ which may fill nan - MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called + MPLPlot.__init__(self, data, **kwargs) self.bw_method = bw_method self.ind = ind self.weights = weights diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 82c5c305b574c..0f2a641d13b11 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -248,6 +248,7 @@ class TestApi(Base): "indexers", "interchange", "typing", + "internals", ] allowed_typing = [ "DataFrameGroupBy", @@ -256,6 +257,7 @@ class TestApi(Base): "ExpandingGroupby", "ExponentialMovingWindow", "ExponentialMovingWindowGroupby", + "FrozenList", "JsonReader", "NaTType", "NAType", @@ -395,13 +397,5 @@ def test_util_in_top_level(self): pd.util.foo -def test_pandas_array_alias(): - msg = "PandasArray has been renamed NumpyExtensionArray" - with 
tm.assert_produces_warning(FutureWarning, match=msg): - res = pd.arrays.PandasArray - - assert res is pd.arrays.NumpyExtensionArray - - def test_set_module(): assert pd.DataFrame.__module__ == "pandas" diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 9f3fee686a056..2501ca6c5e1c4 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -402,7 +402,7 @@ def test_apply_yield_list(float_frame): def test_apply_reduce_Series(float_frame): float_frame.iloc[::2, float_frame.columns.get_loc("A")] = np.nan - expected = float_frame.mean(1) + expected = float_frame.mean(axis=1) result = float_frame.apply(np.mean, axis=1) tm.assert_series_equal(result, expected) @@ -1209,7 +1209,7 @@ def test_agg_multiple_mixed_raises(): ) # sorted index - msg = "does not support reduction" + msg = "does not support operation" with pytest.raises(TypeError, match=msg): mdf.agg(["min", "sum"]) @@ -1309,7 +1309,7 @@ def test_nuiscance_columns(): ) tm.assert_frame_equal(result, expected) - msg = "does not support reduction" + msg = "does not support operation" with pytest.raises(TypeError, match=msg): df.agg("sum") @@ -1317,7 +1317,7 @@ def test_nuiscance_columns(): expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) tm.assert_series_equal(result, expected) - msg = "does not support reduction" + msg = "does not support operation" with pytest.raises(TypeError, match=msg): df.agg(["sum"]) diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index e9192dae66a46..50cf0f0ed3e84 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -19,27 +19,18 @@ @pytest.mark.parametrize("func", ["sum", "mean", "min", "max", "std"]) @pytest.mark.parametrize( - "args,kwds", + "kwds", [ - pytest.param([], {}, id="no_args_or_kwds"), - pytest.param([1], {}, id="axis_from_args"), - pytest.param([], {"axis": 1}, id="axis_from_kwds"), - pytest.param([], {"numeric_only": True}, id="optional_kwds"), - pytest.param([1, True], {"numeric_only": True}, id="args_and_kwds"), + pytest.param({}, id="no_kwds"), + pytest.param({"axis": 1}, id="on_axis"), + pytest.param({"numeric_only": True}, id="func_kwds"), + pytest.param({"axis": 1, "numeric_only": True}, id="axis_and_func_kwds"), ], ) @pytest.mark.parametrize("how", ["agg", "apply"]) -def test_apply_with_string_funcs(request, float_frame, func, args, kwds, how): - if len(args) > 1 and how == "agg": - request.applymarker( - pytest.mark.xfail( - raises=TypeError, - reason="agg/apply signature mismatch - agg passes 2nd " - "argument to func", - ) - ) - result = getattr(float_frame, how)(func, *args, **kwds) - expected = getattr(float_frame, func)(*args, **kwds) +def test_apply_with_string_funcs(request, float_frame, func, kwds, how): + result = getattr(float_frame, how)(func, **kwds) + expected = getattr(float_frame, func)(**kwds) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 857b14e2a2558..1069a9e5aaa90 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -35,13 +35,6 @@ class TestCategoricalConstructors: - def test_fastpath_deprecated(self): - codes = np.array([1, 2, 3]) - dtype = CategoricalDtype(categories=["a", "b", "c", "d"], ordered=False) - msg = "The 'fastpath' keyword in Categorical is deprecated" - with tm.assert_produces_warning(DeprecationWarning, 
match=msg): - Categorical(codes, dtype=dtype, fastpath=True) - def test_categorical_from_cat_and_dtype_str_preserve_ordered(self): # GH#49309 we should preserve orderedness in `res` cat = Categorical([3, 1], categories=[3, 2, 1], ordered=True) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 8778df832d4d7..dbc6cc7715744 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -374,14 +374,14 @@ def test_numeric_like_ops(self): # min/max) s = df["value_group"] for op in ["kurt", "skew", "var", "std", "mean", "sum", "median"]: - msg = f"does not support reduction '{op}'" + msg = f"does not support operation '{op}'" with pytest.raises(TypeError, match=msg): getattr(s, op)(numeric_only=False) def test_numeric_like_ops_series(self): # numpy ops s = Series(Categorical([1, 2, 3, 4])) - with pytest.raises(TypeError, match="does not support reduction 'sum'"): + with pytest.raises(TypeError, match="does not support operation 'sum'"): np.sum(s) @pytest.mark.parametrize( diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 883d6ea3959ff..c35e8204f3437 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -52,10 +52,11 @@ def test_set_fill_value(self): arr.fill_value = 2 assert arr.fill_value == 2 - msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "fill_value must be a valid value for the SparseDtype.subtype" + with pytest.raises(ValueError, match=msg): + # GH#53043 arr.fill_value = 3.1 - assert arr.fill_value == 3.1 + assert arr.fill_value == 2 arr.fill_value = np.nan assert np.isnan(arr.fill_value) @@ -64,8 +65,9 @@ def test_set_fill_value(self): arr.fill_value = True assert arr.fill_value is True - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match=msg): arr.fill_value = 0 + assert arr.fill_value is True arr.fill_value = np.nan assert np.isnan(arr.fill_value) diff --git a/pandas/tests/arrays/sparse/test_constructors.py b/pandas/tests/arrays/sparse/test_constructors.py index 2831c8abdaf13..012ff1da0d431 100644 --- a/pandas/tests/arrays/sparse/test_constructors.py +++ b/pandas/tests/arrays/sparse/test_constructors.py @@ -144,20 +144,12 @@ def test_constructor_spindex_dtype(self): @pytest.mark.parametrize("sparse_index", [None, IntIndex(1, [0])]) def test_constructor_spindex_dtype_scalar(self, sparse_index): # scalar input - msg = "Constructing SparseArray with scalar data is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None) - exp = SparseArray([1], dtype=None) - tm.assert_sp_array_equal(arr, exp) - assert arr.dtype == SparseDtype(np.int64) - assert arr.fill_value == 0 + msg = "Cannot construct SparseArray from scalar data. 
Pass a sequence instead" + with pytest.raises(TypeError, match=msg): + SparseArray(data=1, sparse_index=sparse_index, dtype=None) - with tm.assert_produces_warning(FutureWarning, match=msg): - arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) - exp = SparseArray([1], dtype=None) - tm.assert_sp_array_equal(arr, exp) - assert arr.dtype == SparseDtype(np.int64) - assert arr.fill_value == 0 + with pytest.raises(TypeError, match=msg): + SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) def test_constructor_spindex_dtype_scalar_broadcasts(self): arr = SparseArray( diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 6f0d41333f2fd..1819744d9a9ae 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -84,7 +84,6 @@ def test_nans_not_equal(): (SparseDtype("float64"), SparseDtype("float32")), (SparseDtype("float64"), SparseDtype("float64", 0)), (SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)), - (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)), (SparseDtype("float64"), np.dtype("float64")), ] diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 971c5bf487104..cfc04b5c91354 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -247,7 +247,7 @@ def test_scalar_from_string(self, arr1d): assert result == arr1d[0] def test_reduce_invalid(self, arr1d): - msg = "does not support reduction 'not a method'" + msg = "does not support operation 'not a method'" with pytest.raises(TypeError, match=msg): arr1d._reduce("not a method") diff --git a/pandas/tests/config/test_config.py b/pandas/tests/config/test_config.py index 5b1d4cde9fb59..aaf6178866ecd 100644 --- a/pandas/tests/config/test_config.py +++ b/pandas/tests/config/test_config.py @@ -227,7 +227,6 @@ def test_validation(self): validator = cf.is_one_of_factory([None, cf.is_callable]) cf.register_option("b", lambda: None, "doc", validator=validator) - # pylint: disable-next=consider-using-f-string cf.set_option("b", "%.1f".format) # Formatter is callable cf.set_option("b", None) # Formatter is none (default) with pytest.raises(ValueError, match="Value must be a callable"): diff --git a/pandas/tests/config/test_localization.py b/pandas/tests/config/test_localization.py index 844f67cd2d0ea..b9a0a44bf8c89 100644 --- a/pandas/tests/config/test_localization.py +++ b/pandas/tests/config/test_localization.py @@ -92,7 +92,7 @@ def test_can_set_locale_invalid_get(monkeypatch): # but a subsequent getlocale() raises a ValueError. def mock_get_locale(): - raise ValueError() + raise ValueError with monkeypatch.context() as m: m.setattr(locale, "getlocale", mock_get_locale) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 96a67591f6c78..668e7192c0e52 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -240,8 +240,6 @@ def test_is_list_like_generic(): # is_list_like was yielding false positives for Generic classes in python 3.11 T = TypeVar("T") - # https://github.com/pylint-dev/pylint/issues/9398 - # pylint: disable=multiple-statements class MyDataFrame(DataFrame, Generic[T]): ... 
tstc = MyDataFrame[int] diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index dcbbac44d083a..bab8566a06dc2 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -165,7 +165,7 @@ def test_in_numeric_groupby(self, data_for_grouping): # period "does not support sum operations", # datetime - "does not support operation: 'sum'", + "does not support operation 'sum'", # all others re.escape(f"agg function failed [how->sum,dtype->{dtype}"), ] diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 26638c6160b7b..225a3301b8b8c 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -191,10 +191,10 @@ def test_argmax_argmin_no_skipna_notimplemented(self, data_missing_for_sorting): # GH#38733 data = data_missing_for_sorting - with pytest.raises(NotImplementedError, match=""): + with pytest.raises(ValueError, match="Encountered an NA value"): data.argmin(skipna=False) - with pytest.raises(NotImplementedError, match=""): + with pytest.raises(ValueError, match="Encountered an NA value"): data.argmax(skipna=False) @pytest.mark.parametrize( diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 328c6cd6164fb..4b9234a9904a2 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -68,6 +68,12 @@ def test_fillna_scalar(self, data_missing): expected = data_missing.fillna(valid) tm.assert_extension_array_equal(result, expected) + def test_fillna_with_none(self, data_missing): + # GH#57723 + result = data_missing.fillna(None) + expected = data_missing + tm.assert_extension_array_equal(result, expected) + def test_fillna_limit_pad(self, data_missing): arr = data_missing.take([1, 0, 0, 0, 1]) result = pd.Series(arr).ffill(limit=2) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 03952d87f0ac6..c3a6daee2dd54 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -86,7 +86,7 @@ def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): # TODO: the message being checked here isn't actually checking anything msg = ( "[Cc]annot perform|Categorical is not ordered for operation|" - "does not support reduction|" + "does not support operation|" ) with pytest.raises(TypeError, match=msg): @@ -105,7 +105,7 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): # TODO: the message being checked here isn't actually checking anything msg = ( "[Cc]annot perform|Categorical is not ordered for operation|" - "does not support reduction|" + "does not support operation|" ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index a2721908e858f..504bafc145108 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -144,6 +144,14 @@ def test_fillna_series(self, data_missing): ): super().test_fillna_series(data_missing) + def test_fillna_with_none(self, data_missing): + # GH#57723 + # EAs that don't have special logic for None will raise, unlike pandas' + # which interpret None as the NA value for the dtype. 
+ msg = "conversion from NoneType to Decimal is not supported" + with pytest.raises(TypeError, match=msg): + super().test_fillna_with_none(data_missing) + @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): all_data = all_data[:10] diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 6ecbf2063f203..22ac9627f6cda 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -149,6 +149,13 @@ def test_fillna_frame(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" super().test_fillna_frame() + def test_fillna_with_none(self, data_missing): + # GH#57723 + # EAs that don't have special logic for None will raise, unlike pandas' + # which interpret None as the NA value for the dtype. + with pytest.raises(AssertionError): + super().test_fillna_with_none(data_missing) + @pytest.mark.parametrize( "limit_area, input_ilocs, expected_ilocs", [ diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 5de4865feb6f9..a42fa6088d9c8 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -104,7 +104,7 @@ def _supports_reduction(self, obj, op_name: str) -> bool: @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): meth = all_boolean_reductions - msg = f"datetime64 type does not support operation: '{meth}'" + msg = f"datetime64 type does not support operation '{meth}'" with pytest.raises(TypeError, match=msg): super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 49e5c4aff5afe..5a6fe07aa007b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -524,6 +524,16 @@ def test_loc_setitem_boolean_mask_allfalse(self): result.loc[result.b.isna(), "a"] = result.a.copy() tm.assert_frame_equal(result, df) + def test_getitem_slice_empty(self): + df = DataFrame([[1]], columns=MultiIndex.from_product([["A"], ["a"]])) + result = df[:] + + expected = DataFrame([[1]], columns=MultiIndex.from_product([["A"], ["a"]])) + + tm.assert_frame_equal(result, expected) + # Ensure df[:] returns a view of df, not the same object + assert result is not df + def test_getitem_fancy_slice_integers_step(self): df = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 658fafd3ea2cc..3f98f49cd1877 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -41,7 +41,7 @@ class TestDataFrameSetItem: def test_setitem_str_subclass(self): # GH#37366 class mystring(str): - pass + __slots__ = () data = ["2020-10-22 01:21:00+00:00"] index = DatetimeIndex(data) diff --git a/pandas/tests/frame/methods/test_asof.py b/pandas/tests/frame/methods/test_asof.py index 029aa3a5b8f05..c510ef78d03aa 100644 --- a/pandas/tests/frame/methods/test_asof.py +++ b/pandas/tests/frame/methods/test_asof.py @@ -36,18 +36,18 @@ def test_basic(self, date_range_frame): dates = date_range("1/1/1990", periods=N * 3, freq="25s") result = df.asof(dates) - assert result.notna().all(1).all() + assert result.notna().all(axis=1).all() lb = df.index[14] ub = df.index[30] dates = list(dates) result = df.asof(dates) - 
assert result.notna().all(1).all() + assert result.notna().all(axis=1).all() mask = (result.index >= lb) & (result.index < ub) rs = result[mask] - assert (rs == 14).all(1).all() + assert (rs == 14).all(axis=1).all() def test_subset(self, date_range_frame): N = 10 diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 81f66cfd48b0a..e858c123e4dae 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -64,8 +64,8 @@ def test_fillna_datetime(self, datetime_frame): padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"] ).all() - msg = "Must specify a fill 'value'" - with pytest.raises(ValueError, match=msg): + msg = r"missing 1 required positional argument: 'value'" + with pytest.raises(TypeError, match=msg): datetime_frame.fillna() @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") @@ -466,7 +466,7 @@ def test_fillna_dict_series(self): # disable this for now with pytest.raises(NotImplementedError, match="column by column"): - df.fillna(df.max(1), axis=1) + df.fillna(df.max(axis=1), axis=1) def test_fillna_dataframe(self): # GH#8377 @@ -779,3 +779,17 @@ def test_ffill_bfill_limit_area(data, expected_data, method, kwargs): expected = DataFrame(expected_data) result = getattr(df, method)(**kwargs) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("test_frame", [True, False]) +@pytest.mark.parametrize("dtype", ["float", "object"]) +def test_fillna_with_none_object(test_frame, dtype): + # GH#57723 + obj = Series([1, np.nan, 3], dtype=dtype) + if test_frame: + obj = obj.to_frame() + result = obj.fillna(value=None) + expected = Series([1, None, 3], dtype=dtype) + if test_frame: + expected = expected.to_frame() + tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_rename_axis.py b/pandas/tests/frame/methods/test_rename_axis.py index 908a3f728c749..dd4a77c6509b8 100644 --- a/pandas/tests/frame/methods/test_rename_axis.py +++ b/pandas/tests/frame/methods/test_rename_axis.py @@ -60,15 +60,15 @@ def test_rename_axis_mapper(self): # Test for renaming index using dict result = df.rename_axis(index={"ll": "foo"}) - assert result.index.names == ("foo", "nn") + assert result.index.names == ["foo", "nn"] # Test for renaming index using a function result = df.rename_axis(index=str.upper, axis=0) - assert result.index.names == ("LL", "NN") + assert result.index.names == ["LL", "NN"] # Test for renaming index providing complete list result = df.rename_axis(index=["foo", "goo"]) - assert result.index.names == ("foo", "goo") + assert result.index.names == ["foo", "goo"] # Test for changing index and columns at same time sdf = df.reset_index().set_index("nn").drop(columns=["ll", "y"]) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 3b9c342f35a71..fb7ba2b7af38a 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1264,13 +1264,8 @@ def test_replace_invalid_to_replace(self): r"Expecting 'to_replace' to be either a scalar, array-like, " r"dict or None, got invalid type.*" ) - msg2 = ( - "DataFrame.replace without 'value' and with non-dict-like " - "'to_replace' is deprecated" - ) with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - df.replace(lambda x: x.strip()) + df.replace(lambda x: x.strip()) @pytest.mark.parametrize("dtype", ["float", "float64", "int64", 
"Int64", "boolean"]) @pytest.mark.parametrize("value", [np.nan, pd.NA]) diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 4fbc84cd1a66c..198cab0e91eab 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -148,7 +148,7 @@ def test_set_index_dst(self): def test_set_index(self, float_string_frame): df = float_string_frame - idx = Index(np.arange(len(df))[::-1]) + idx = Index(np.arange(len(df) - 1, -1, -1, dtype=np.int64)) df = df.set_index(idx) tm.assert_index_equal(df.index, idx) @@ -163,7 +163,7 @@ def test_set_index_names(self): ) df.index.name = "name" - assert df.set_index(df.index).index.names == ("name",) + assert df.set_index(df.index).index.names == ["name"] mi = MultiIndex.from_arrays(df[["A", "B"]].T.values, names=["A", "B"]) mi2 = MultiIndex.from_arrays( @@ -172,7 +172,7 @@ def test_set_index_names(self): df = df.set_index(["A", "B"]) - assert df.set_index(df.index).index.names == ("A", "B") + assert df.set_index(df.index).index.names == ["A", "B"] # Check that set_index isn't converting a MultiIndex into an Index assert isinstance(df.set_index(df.index).index, MultiIndex) @@ -292,7 +292,7 @@ def test_set_index_pass_single_array( # only valid column keys are dropped # since B is always passed as array above, nothing is dropped expected = df.set_index(["B"], drop=False, append=append) - expected.index.names = [index_name] + list(name) if append else name + expected.index.names = [index_name] + name if append else name tm.assert_frame_equal(result, expected) @@ -464,12 +464,12 @@ def test_set_index_datetime(self): df = df.set_index("label", append=True) tm.assert_index_equal(df.index.levels[0], expected) tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label")) - assert df.index.names == ("datetime", "label") + assert df.index.names == ["datetime", "label"] df = df.swaplevel(0, 1) tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label")) tm.assert_index_equal(df.index.levels[1], expected) - assert df.index.names == ("label", "datetime") + assert df.index.names == ["label", "datetime"] df = DataFrame(np.random.default_rng(2).random(6)) idx1 = DatetimeIndex( diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index b856a7ff5d26b..c146dcc9c2d71 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -857,7 +857,7 @@ def test_sort_index_level_and_column_label( ) # Get index levels from df_idx - levels = list(df_idx.index.names) + levels = df_idx.index.names # Compute expected by sorting on columns and the setting index expected = df_none.sort_values( @@ -875,7 +875,7 @@ def test_sort_column_level_and_index_label( # GH#14353 # Get levels from df_idx - levels = list(df_idx.index.names) + levels = df_idx.index.names # Compute expected by sorting on axis=0, setting index levels, and then # transposing. 
For some cases this will result in a frame with diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 5b9ced8d47ed7..66a35c6f486a4 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1405,3 +1405,22 @@ def test_to_csv_categorical_and_interval(self): expected_rows = [",a", '0,"[2020-01-01 00:00:00, 2020-01-02 00:00:00]"'] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected + + def test_to_csv_warn_when_zip_tar_and_append_mode(self, tmp_path): + # GH57875 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + "zip and tar do not support mode 'a' properly. This combination will " + "result in multiple files with same name being added to the archive" + ) + zip_path = tmp_path / "test.zip" + tar_path = tmp_path / "test.tar" + with tm.assert_produces_warning( + RuntimeWarning, match=msg, raise_on_extra_warnings=False + ): + df.to_csv(zip_path, mode="a") + + with tm.assert_produces_warning( + RuntimeWarning, match=msg, raise_on_extra_warnings=False + ): + df.to_csv(tar_path, mode="a") diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 12d8269b640fc..53476c2f7ce38 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -24,6 +24,7 @@ from pandas._config import using_pyarrow_string_dtype from pandas._libs import lib +from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError from pandas.core.dtypes.common import is_integer_dtype @@ -3052,6 +3053,24 @@ def test_from_dict_with_columns_na_scalar(self): expected = DataFrame({"a": Series([pd.NaT, pd.NaT])}) tm.assert_frame_equal(result, expected) + # TODO: make this not cast to object in pandas 3.0 + @pytest.mark.skipif( + not np_version_gt2, reason="StringDType only available in numpy 2 and above" + ) + @pytest.mark.parametrize( + "data", + [ + {"a": ["a", "b", "c"], "b": [1.0, 2.0, 3.0], "c": ["d", "e", "f"]}, + ], + ) + def test_np_string_array_object_cast(self, data): + from numpy.dtypes import StringDType + + data["a"] = np.array(data["a"], dtype=StringDType()) + res = DataFrame(data) + assert res["a"].dtype == np.object_ + assert (res["a"] == data["a"]).all() + def get1(obj): # TODO: make a helper in tm? 
if isinstance(obj, Series): diff --git a/pandas/tests/frame/test_cumulative.py b/pandas/tests/frame/test_cumulative.py index d7aad680d389e..ab217e1b1332a 100644 --- a/pandas/tests/frame/test_cumulative.py +++ b/pandas/tests/frame/test_cumulative.py @@ -7,10 +7,12 @@ """ import numpy as np +import pytest from pandas import ( DataFrame, Series, + Timestamp, ) import pandas._testing as tm @@ -81,3 +83,25 @@ def test_cumsum_preserve_dtypes(self): } ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("method", ["cumsum", "cumprod", "cummin", "cummax"]) + @pytest.mark.parametrize("axis", [0, 1]) + def test_numeric_only_flag(self, method, axis): + df = DataFrame( + { + "int": [1, 2, 3], + "bool": [True, False, False], + "string": ["a", "b", "c"], + "float": [1.0, 3.5, 4.0], + "datetime": [ + Timestamp(2018, 1, 1), + Timestamp(2019, 1, 1), + Timestamp(2020, 1, 1), + ], + } + ) + df_numeric_only = df.drop(["string", "datetime"], axis=1) + + result = getattr(df, method)(axis=axis, numeric_only=True) + expected = getattr(df_numeric_only, method)(axis) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index d2e36eb6147e7..94e8e469f21e7 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -188,6 +188,16 @@ def test_eval_object_dtype_binop(self): expected = DataFrame({"a1": ["Y", "N"], "c": [True, False]}) tm.assert_frame_equal(res, expected) + def test_using_numpy(self, engine, parser): + # GH 58041 + skip_if_no_pandas_parser(parser) + df = Series([0.2, 1.5, 2.8], name="a").to_frame() + res = df.eval("@np.floor(a)", engine=engine, parser=parser) + expected = np.floor(df["a"]) + if engine == "numexpr": + expected.name = None # See GH 58069 + tm.assert_series_equal(expected, res) + class TestDataFrameQueryWithMultiIndex: def test_query_with_named_multiindex(self, parser, engine): diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index fd3dad37da1f9..8ccd7b2ca83ba 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -773,7 +773,7 @@ def test_operators_timedelta64(self): tm.assert_series_equal(result, expected) # works when only those columns are selected - result = mixed[["A", "B"]].min(1) + result = mixed[["A", "B"]].min(axis=1) expected = Series([timedelta(days=-1)] * 3) tm.assert_series_equal(result, expected) @@ -832,8 +832,8 @@ def test_std_datetime64_with_nat(self, values, skipna, request, unit): def test_sum_corner(self): empty_frame = DataFrame() - axis0 = empty_frame.sum(0) - axis1 = empty_frame.sum(1) + axis0 = empty_frame.sum(axis=0) + axis1 = empty_frame.sum(axis=1) assert isinstance(axis0, Series) assert isinstance(axis1, Series) assert len(axis0) == 0 @@ -967,15 +967,15 @@ def test_sum_object(self, float_frame): def test_sum_bool(self, float_frame): # ensure this works, bug report bools = np.isnan(float_frame) - bools.sum(1) - bools.sum(0) + bools.sum(axis=1) + bools.sum(axis=0) def test_sum_mixed_datetime(self): # GH#30886 df = DataFrame({"A": date_range("2000", periods=4), "B": [1, 2, 3, 4]}).reindex( [2, 3, 4] ) - with pytest.raises(TypeError, match="does not support reduction 'sum'"): + with pytest.raises(TypeError, match="does not support operation 'sum'"): df.sum() def test_mean_corner(self, float_frame, float_string_frame): @@ -990,7 +990,7 @@ def test_mean_corner(self, float_frame, float_string_frame): # take mean of boolean column float_frame["bool"] = 
float_frame["A"] > 0 - means = float_frame.mean(0) + means = float_frame.mean(axis=0) assert means["bool"] == float_frame["bool"].values.mean() def test_mean_datetimelike(self): @@ -1043,13 +1043,13 @@ def test_mean_extensionarray_numeric_only_true(self): def test_stats_mixed_type(self, float_string_frame): with pytest.raises(TypeError, match="could not convert"): - float_string_frame.std(1) + float_string_frame.std(axis=1) with pytest.raises(TypeError, match="could not convert"): - float_string_frame.var(1) + float_string_frame.var(axis=1) with pytest.raises(TypeError, match="unsupported operand type"): - float_string_frame.mean(1) + float_string_frame.mean(axis=1) with pytest.raises(TypeError, match="could not convert"): - float_string_frame.skew(1) + float_string_frame.skew(axis=1) def test_sum_bools(self): df = DataFrame(index=range(1), columns=range(10)) @@ -1066,7 +1066,7 @@ def test_idxmin(self, float_frame, int_frame, skipna, axis): frame.iloc[15:20, -2:] = np.nan for df in [frame, int_frame]: if (not skipna or axis == 1) and df is not int_frame: - if axis == 1: + if skipna: msg = "Encountered all NA values" else: msg = "Encountered an NA value" @@ -1116,7 +1116,7 @@ def test_idxmax(self, float_frame, int_frame, skipna, axis): frame.iloc[15:20, -2:] = np.nan for df in [frame, int_frame]: if (skipna is False or axis == 1) and df is frame: - if axis == 1: + if skipna: msg = "Encountered all NA values" else: msg = "Encountered an NA value" @@ -1331,11 +1331,11 @@ def test_any_all_extra(self): result = df[["A", "B"]].any(axis=1, bool_only=True) tm.assert_series_equal(result, expected) - result = df.all(1) + result = df.all(axis=1) expected = Series([True, False, False], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) - result = df.all(1, bool_only=True) + result = df.all(axis=1, bool_only=True) tm.assert_series_equal(result, expected) # Axis is None @@ -1381,7 +1381,7 @@ def test_any_datetime(self): ] df = DataFrame({"A": float_data, "B": datetime_data}) - msg = "datetime64 type does not support operation: 'any'" + msg = "datetime64 type does not support operation 'any'" with pytest.raises(TypeError, match=msg): df.any(axis=1) @@ -1466,18 +1466,18 @@ def test_any_all_np_func(self, func, data, expected): if any(isinstance(x, CategoricalDtype) for x in data.dtypes): with pytest.raises( - TypeError, match="dtype category does not support reduction" + TypeError, match=".* dtype category does not support operation" ): func(data) # method version with pytest.raises( - TypeError, match="dtype category does not support reduction" + TypeError, match=".* dtype category does not support operation" ): getattr(DataFrame(data), func.__name__)(axis=None) if data.dtypes.apply(lambda x: x.kind == "M").any(): # GH#34479 - msg = "datetime64 type does not support operation: '(any|all)'" + msg = "datetime64 type does not support operation '(any|all)'" with pytest.raises(TypeError, match=msg): func(data) @@ -1734,19 +1734,19 @@ def test_any_all_categorical_dtype_nuisance_column(self, all_boolean_reductions) df = ser.to_frame() # Double-check the Series behavior is to raise - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation"): getattr(ser, all_boolean_reductions)() - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation"): getattr(np, all_boolean_reductions)(ser) - with pytest.raises(TypeError, match="does not support reduction"): + 
with pytest.raises(TypeError, match="does not support operation"): getattr(df, all_boolean_reductions)(bool_only=False) - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation"): getattr(df, all_boolean_reductions)(bool_only=None) - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation"): getattr(np, all_boolean_reductions)(df, axis=0) def test_median_categorical_dtype_nuisance_column(self): @@ -1755,22 +1755,22 @@ def test_median_categorical_dtype_nuisance_column(self): ser = df["A"] # Double-check the Series behavior is to raise - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation"): ser.median() - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation"): df.median(numeric_only=False) - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation"): df.median() # same thing, but with an additional non-categorical column df["B"] = df["A"].astype(int) - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation"): df.median(numeric_only=False) - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation"): df.median() # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead @@ -1964,7 +1964,7 @@ def test_minmax_extensionarray(method, numeric_only): def test_frame_mixed_numeric_object_with_timestamp(ts_value): # GH 13912 df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]}) - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation"): df.sum() diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 09235f154b188..03db284d892e3 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -805,7 +805,7 @@ def test_unstack_multi_level_cols(self): [[10, 20, 30], [10, 20, 40]], names=["i1", "i2", "i3"] ), ) - assert df.unstack(["i2", "i1"]).columns.names[-2:] == ("i2", "i1") + assert df.unstack(["i2", "i1"]).columns.names[-2:] == ["i2", "i1"] def test_unstack_multi_level_rows_and_cols(self): # PH 28306: Unstack df with multi level cols and rows @@ -1848,7 +1848,7 @@ def test_stack_unstack_preserve_names( unstacked = frame.unstack() assert unstacked.index.name == "first" - assert unstacked.columns.names == ("exp", "second") + assert unstacked.columns.names == ["exp", "second"] restacked = unstacked.stack(future_stack=future_stack) assert restacked.index.names == frame.index.names diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 81676a5d8520a..1d0f491529b56 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -35,15 +35,15 @@ def test_set_axis_name_mi(self, func): columns=MultiIndex.from_tuples([("C", x) for x in list("xyz")]), ) - level_names = ("L1", "L2") + level_names = ["L1", "L2"] result = methodcaller(func, level_names)(df) assert result.index.names == level_names - assert result.columns.names == (None, None) + assert result.columns.names == [None, None] result = methodcaller(func, level_names, 
axis=1)(df) - assert result.columns.names == level_names - assert result.index.names == (None, None) + assert result.columns.names == ["L1", "L2"] + assert result.index.names == [None, None] def test_nonzero_single_element(self): df = DataFrame([[False, False]]) diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index cbaf064c379ea..7dcdcd96cce51 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -24,9 +24,9 @@ def test_set_axis_name_mi(self, func): result = methodcaller(func, ["L1", "L2"])(ser) assert ser.index.name is None - assert ser.index.names == ("l1", "l2") + assert ser.index.names == ["l1", "l2"] assert result.index.name is None - assert result.index.names == ("L1", "L2") + assert result.index.names, ["L1", "L2"] def test_set_axis_name_raises(self): ser = Series([1]) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 9b825b73c26c0..af0deba138469 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -454,8 +454,5 @@ def test_groupby_quantile_nonmulti_levels_order(): tm.assert_series_equal(result, expected) # We need to check that index levels are not sorted - tm.assert_index_equal( - result.index.levels[0], Index(["B", "A"], dtype=object, name="cat1") - ) - tm.assert_index_equal(result.index.levels[1], Index([0.2, 0.8])) - assert isinstance(result.index.levels, tuple) + expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]]) + tm.assert_equal(result.index.levels, expected_levels) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index a8d359f3206c2..be52b4a591c26 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -108,7 +108,7 @@ def rebuild_index(df): gr = df.groupby(keys, sort=isort) right = gr["3rd"].apply(Series.value_counts, **kwargs) - right.index.names = tuple(list(right.index.names[:-1]) + ["3rd"]) + right.index.names = right.index.names[:-1] + ["3rd"] # https://github.com/pandas-dev/pandas/issues/49909 right = right.rename(name) diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index b5fdf058d1ab0..d2cfa530e7c65 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -183,10 +183,9 @@ def test_frame_consistency(groupby_func): elif groupby_func in ("bfill", "ffill"): exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): - exclude_expected = {"skipna", "args"} - exclude_result = {"numeric_only"} + exclude_expected = {"axis", "skipna", "args"} elif groupby_func in ("cumprod", "cumsum"): - exclude_expected = {"skipna"} + exclude_expected = {"axis", "skipna", "numeric_only"} elif groupby_func in ("pct_change",): exclude_expected = {"kwargs"} elif groupby_func in ("rank",): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 9bd2c22788fac..1a2589fe94ea5 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -987,7 +987,7 @@ def test_apply_multi_level_name(category): ).set_index(["A", "B"]) result = df.groupby("B", observed=False).apply(lambda x: x.sum()) tm.assert_frame_equal(result, expected) - assert df.index.names == ("A", "B") + assert df.index.names == ["A", "B"] def test_groupby_apply_datetime_result_dtypes(using_infer_string): diff --git 
a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index be8f5d73fe7e8..54d7895691f3f 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -671,7 +671,7 @@ def test_raises_on_nuisance(df): df = df.loc[:, ["A", "C", "D"]] df["E"] = datetime.now() grouped = df.groupby("A") - msg = "datetime64 type does not support operation: 'sum'" + msg = "datetime64 type does not support operation 'sum'" with pytest.raises(TypeError, match=msg): grouped.agg("sum") with pytest.raises(TypeError, match=msg): @@ -1794,7 +1794,7 @@ def get_categorical_invalid_expected(): else: msg = "category type does not support" if op == "skew": - msg = "|".join([msg, "does not support reduction 'skew'"]) + msg = "|".join([msg, "does not support operation 'skew'"]) with pytest.raises(TypeError, match=msg): get_result() diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 9ce7a0818ac02..063b0ce38387f 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1044,7 +1044,6 @@ def test_multi_iter_frame(self, three_group): grouped = df.groupby(["k1", "k2"]) # calling `dict` on a DataFrameGroupBy leads to a TypeError, # we need to use a dictionary comprehension here - # pylint: disable-next=unnecessary-comprehension groups = {key: gp for key, gp in grouped} # noqa: C416 assert len(groups) == 2 diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 7af27d7227035..9301f8d56d9d2 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -241,16 +241,16 @@ def test_groupby_raises_datetime( return klass, msg = { - "all": (TypeError, "datetime64 type does not support operation: 'all'"), - "any": (TypeError, "datetime64 type does not support operation: 'any'"), + "all": (TypeError, "'all' with datetime64 dtypes is no longer supported"), + "any": (TypeError, "'any' with datetime64 dtypes is no longer supported"), "bfill": (None, ""), "corrwith": (TypeError, "cannot perform __mul__ with this index type"), "count": (None, ""), "cumcount": (None, ""), "cummax": (None, ""), "cummin": (None, ""), - "cumprod": (TypeError, "datetime64 type does not support operation: 'cumprod'"), - "cumsum": (TypeError, "datetime64 type does not support operation: 'cumsum'"), + "cumprod": (TypeError, "datetime64 type does not support operation 'cumprod'"), + "cumsum": (TypeError, "datetime64 type does not support operation 'cumsum'"), "diff": (None, ""), "ffill": (None, ""), "fillna": (None, ""), @@ -265,7 +265,7 @@ def test_groupby_raises_datetime( "ngroup": (None, ""), "nunique": (None, ""), "pct_change": (TypeError, "cannot perform __truediv__ with this index type"), - "prod": (TypeError, "datetime64 type does not support operation: 'prod'"), + "prod": (TypeError, "datetime64 type does not support operation 'prod'"), "quantile": (None, ""), "rank": (None, ""), "sem": (None, ""), @@ -275,14 +275,14 @@ def test_groupby_raises_datetime( TypeError, "|".join( [ - r"dtype datetime64\[ns\] does not support reduction", - "datetime64 type does not support operation: 'skew'", + r"dtype datetime64\[ns\] does not support operation", + "datetime64 type does not support operation 'skew'", ] ), ), "std": (None, ""), - "sum": (TypeError, "datetime64 type does not support operation: 'sum"), - "var": (TypeError, "datetime64 type does not support operation: 'var'"), + "sum": (TypeError, "datetime64 type does not support operation 'sum"), + "var": 
(TypeError, "datetime64 type does not support operation 'var'"), }[groupby_func] if groupby_func == "fillna": @@ -323,7 +323,7 @@ def test_groupby_raises_datetime_np( klass, msg = { np.sum: ( TypeError, - re.escape("datetime64[us] does not support reduction 'sum'"), + re.escape("datetime64[us] does not support operation 'sum'"), ), np.mean: (None, ""), }[groupby_func_np] @@ -417,7 +417,7 @@ def test_groupby_raises_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'mean'", + "'Categorical' .* does not support operation 'mean'", "category dtype does not support aggregation 'mean'", ] ), @@ -426,7 +426,7 @@ def test_groupby_raises_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'median'", + "'Categorical' .* does not support operation 'median'", "category dtype does not support aggregation 'median'", ] ), @@ -445,7 +445,7 @@ def test_groupby_raises_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'sem'", + "'Categorical' .* does not support operation 'sem'", "category dtype does not support aggregation 'sem'", ] ), @@ -456,7 +456,7 @@ def test_groupby_raises_category( TypeError, "|".join( [ - "dtype category does not support reduction 'skew'", + "dtype category does not support operation 'skew'", "category type does not support skew operations", ] ), @@ -465,7 +465,7 @@ def test_groupby_raises_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'std'", + "'Categorical' .* does not support operation 'std'", "category dtype does not support aggregation 'std'", ] ), @@ -475,7 +475,7 @@ def test_groupby_raises_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'var'", + "'Categorical' .* does not support operation 'var'", "category dtype does not support aggregation 'var'", ] ), @@ -519,10 +519,10 @@ def test_groupby_raises_category_np( gb = gb["d"] klass, msg = { - np.sum: (TypeError, "dtype category does not support reduction 'sum'"), + np.sum: (TypeError, "dtype category does not support operation 'sum'"), np.mean: ( TypeError, - "dtype category does not support reduction 'mean'", + "dtype category does not support operation 'mean'", ), }[groupby_func_np] _call_and_check(klass, msg, how, gb, groupby_func_np, ()) @@ -618,7 +618,7 @@ def test_groupby_raises_category_on_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'sem'", + "'Categorical' .* does not support operation 'sem'", "category dtype does not support aggregation 'sem'", ] ), @@ -630,7 +630,7 @@ def test_groupby_raises_category_on_category( "|".join( [ "category type does not support skew operations", - "dtype category does not support reduction 'skew'", + "dtype category does not support operation 'skew'", ] ), ), @@ -638,7 +638,7 @@ def test_groupby_raises_category_on_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'std'", + "'Categorical' .* does not support operation 'std'", "category dtype does not support aggregation 'std'", ] ), @@ -648,7 +648,7 @@ def test_groupby_raises_category_on_category( TypeError, "|".join( [ - "'Categorical' .* does not support reduction 'var'", + "'Categorical' .* does not support operation 'var'", "category dtype does not support aggregation 'var'", ] ), diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 43fcfd1e59670..99d05dd0f26e4 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ 
b/pandas/tests/indexes/datetimes/test_date_range.py @@ -135,16 +135,14 @@ def test_date_range_name(self): assert idx.name == "TEST" def test_date_range_invalid_periods(self): - msg = "periods must be a number, got foo" + msg = "periods must be an integer, got foo" with pytest.raises(TypeError, match=msg): date_range(start="1/1/2000", periods="foo", freq="D") def test_date_range_fractional_period(self): - msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" - with tm.assert_produces_warning(FutureWarning, match=msg): - rng = date_range("1/1/2000", periods=10.5) - exp = date_range("1/1/2000", periods=10) - tm.assert_index_equal(rng, exp) + msg = "periods must be an integer" + with pytest.raises(TypeError, match=msg): + date_range("1/1/2000", periods=10.5) @pytest.mark.parametrize( "freq,freq_depr", @@ -1042,7 +1040,7 @@ def test_constructor(self): bdate_range(START, periods=20, freq=BDay()) bdate_range(end=START, periods=20, freq=BDay()) - msg = "periods must be a number, got B" + msg = "periods must be an integer, got B" with pytest.raises(TypeError, match=msg): date_range("2011-1-1", "2012-1-1", "B") @@ -1120,7 +1118,7 @@ def test_constructor(self): bdate_range(START, periods=20, freq=CDay()) bdate_range(end=START, periods=20, freq=CDay()) - msg = "periods must be a number, got C" + msg = "periods must be an integer, got C" with pytest.raises(TypeError, match=msg): date_range("2011-1-1", "2012-1-1", "C") diff --git a/pandas/tests/indexes/datetimes/test_iter.py b/pandas/tests/indexes/datetimes/test_iter.py index a006ed79f27ba..dbd233eed908f 100644 --- a/pandas/tests/indexes/datetimes/test_iter.py +++ b/pandas/tests/indexes/datetimes/test_iter.py @@ -20,7 +20,7 @@ def test_iteration_preserves_nanoseconds(self, tz): ["2018-02-08 15:00:00.168456358", "2018-02-08 15:00:00.168456359"], tz=tz ) for i, ts in enumerate(index): - assert ts == index[i] # pylint: disable=unnecessary-list-index-lookup + assert ts == index[i] def test_iter_readonly(self): # GH#28055 ints_to_pydatetime with readonly array @@ -35,7 +35,7 @@ def test_iteration_preserves_tz(self): for i, ts in enumerate(index): result = ts - expected = index[i] # pylint: disable=unnecessary-list-index-lookup + expected = index[i] assert result == expected def test_iteration_preserves_tz2(self): @@ -45,7 +45,7 @@ def test_iteration_preserves_tz2(self): for i, ts in enumerate(index): result = ts - expected = index[i] # pylint: disable=unnecessary-list-index-lookup + expected = index[i] assert result._repr_base == expected._repr_base assert result == expected @@ -56,7 +56,7 @@ def test_iteration_preserves_tz3(self): ) for i, ts in enumerate(index): result = ts - expected = index[i] # pylint: disable=unnecessary-list-index-lookup + expected = index[i] assert result._repr_base == expected._repr_base assert result == expected diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index 7aea481b49221..5252b85ad8d0e 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -236,11 +236,10 @@ def test_interval_dtype(self, start, end, expected): def test_interval_range_fractional_period(self): # float value for periods - expected = interval_range(start=0, periods=10) - msg = "Non-integer 'periods' in pd.date_range, .* pd.interval_range" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = interval_range(start=0, periods=10.5) - tm.assert_index_equal(result, expected) + msg = "periods 
must be an integer, got 10.5" + ts = Timestamp("2024-03-25") + with pytest.raises(TypeError, match=msg): + interval_range(ts, periods=10.5) def test_constructor_coverage(self): # equivalent timestamp-like start/end @@ -340,7 +339,7 @@ def test_errors(self): interval_range(start=Timedelta("1 day"), end=Timedelta("10 days"), freq=2) # invalid periods - msg = "periods must be a number, got foo" + msg = "periods must be an integer, got foo" with pytest.raises(TypeError, match=msg): interval_range(start=0, periods="foo") diff --git a/pandas/tests/indexes/multi/test_astype.py b/pandas/tests/indexes/multi/test_astype.py index c993f425fa132..29908537fbe59 100644 --- a/pandas/tests/indexes/multi/test_astype.py +++ b/pandas/tests/indexes/multi/test_astype.py @@ -11,7 +11,7 @@ def test_astype(idx): actual = idx.astype("O") tm.assert_copy(actual.levels, expected.levels) tm.assert_copy(actual.codes, expected.codes) - assert actual.names == expected.names + assert actual.names == list(expected.names) with pytest.raises(TypeError, match="^Setting.*dtype.*object"): idx.astype(np.dtype(int)) diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 2b16f2c4c095d..38e0920b7004e 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -27,7 +27,7 @@ def test_constructor_single_level(): assert isinstance(result, MultiIndex) expected = Index(["foo", "bar", "baz", "qux"], name="first") tm.assert_index_equal(result.levels[0], expected) - assert result.names == ("first",) + assert result.names == ["first"] def test_constructor_no_levels(): @@ -277,7 +277,7 @@ def test_from_arrays_empty(): assert isinstance(result, MultiIndex) expected = Index([], name="A") tm.assert_index_equal(result.levels[0], expected) - assert result.names == ("A",) + assert result.names == ["A"] # N levels for N in [2, 3]: @@ -424,7 +424,7 @@ def test_from_product_empty_one_level(): result = MultiIndex.from_product([[]], names=["A"]) expected = Index([], name="A") tm.assert_index_equal(result.levels[0], expected) - assert result.names == ("A",) + assert result.names == ["A"] @pytest.mark.parametrize( @@ -712,7 +712,7 @@ def test_from_frame_dtype_fidelity(): @pytest.mark.parametrize( - "names_in,names_out", [(None, (("L1", "x"), ("L2", "y"))), (["x", "y"], ("x", "y"))] + "names_in,names_out", [(None, [("L1", "x"), ("L2", "y")]), (["x", "y"], ["x", "y"])] ) def test_from_frame_valid_names(names_in, names_out): # GH 22420 @@ -812,13 +812,13 @@ def test_constructor_with_tz(): result = MultiIndex.from_arrays([index, columns]) - assert result.names == ("dt1", "dt2") + assert result.names == ["dt1", "dt2"] tm.assert_index_equal(result.levels[0], index) tm.assert_index_equal(result.levels[1], columns) result = MultiIndex.from_arrays([Series(index), Series(columns)]) - assert result.names == ("dt1", "dt2") + assert result.names == ["dt1", "dt2"] tm.assert_index_equal(result.levels[0], index) tm.assert_index_equal(result.levels[1], columns) diff --git a/pandas/tests/indexes/multi/test_copy.py b/pandas/tests/indexes/multi/test_copy.py index 14d327093500e..2e09a580f9528 100644 --- a/pandas/tests/indexes/multi/test_copy.py +++ b/pandas/tests/indexes/multi/test_copy.py @@ -70,7 +70,7 @@ def test_copy_method(deep): @pytest.mark.parametrize( "kwarg, value", [ - ("names", ("third", "fourth")), + ("names", ["third", "fourth"]), ], ) def test_copy_method_kwargs(deep, kwarg, value): diff --git a/pandas/tests/indexes/multi/test_duplicates.py 
b/pandas/tests/indexes/multi/test_duplicates.py index 622520f45f904..1bbeedac3fb10 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -112,7 +112,7 @@ def test_duplicate_multiindex_codes(): mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]]) -@pytest.mark.parametrize("names", [("a", "b", "a"), (1, 1, 2), (1, "a", 1)]) +@pytest.mark.parametrize("names", [["a", "b", "a"], [1, 1, 2], [1, "a", 1]]) def test_duplicate_level_names(names): # GH18872, GH19029 mi = MultiIndex.from_product([[0, 1]] * 3, names=names) diff --git a/pandas/tests/indexes/multi/test_formats.py b/pandas/tests/indexes/multi/test_formats.py index cc6a33c22503d..6ea42349bd04a 100644 --- a/pandas/tests/indexes/multi/test_formats.py +++ b/pandas/tests/indexes/multi/test_formats.py @@ -56,14 +56,14 @@ def test_repr_max_seq_items_equal_to_n(self, idx): ('baz', 'two'), ('qux', 'one'), ('qux', 'two')], - names=('first', 'second'))""" + names=['first', 'second'])""" assert result == expected def test_repr(self, idx): result = idx[:1].__repr__() expected = """\ MultiIndex([('foo', 'one')], - names=('first', 'second'))""" + names=['first', 'second'])""" assert result == expected result = idx.__repr__() @@ -74,7 +74,7 @@ def test_repr(self, idx): ('baz', 'two'), ('qux', 'one'), ('qux', 'two')], - names=('first', 'second'))""" + names=['first', 'second'])""" assert result == expected with pd.option_context("display.max_seq_items", 5): @@ -85,7 +85,7 @@ def test_repr(self, idx): ... ('qux', 'one'), ('qux', 'two')], - names=('first', 'second'), length=6)""" + names=['first', 'second'], length=6)""" assert result == expected # display.max_seq_items == 1 @@ -94,7 +94,7 @@ def test_repr(self, idx): expected = """\ MultiIndex([... 
('qux', 'two')], - names=('first', ...), length=6)""" + names=['first', ...], length=6)""" assert result == expected def test_rjust(self): @@ -105,7 +105,7 @@ def test_rjust(self): result = mi[:1].__repr__() expected = """\ MultiIndex([('a', 9, '2000-01-01 00:00:00')], - names=('a', 'b', 'dti'))""" + names=['a', 'b', 'dti'])""" assert result == expected result = mi[::500].__repr__() @@ -114,7 +114,7 @@ def test_rjust(self): ( 'a', 9, '2000-01-01 00:08:20'), ('abc', 10, '2000-01-01 00:16:40'), ('abc', 10, '2000-01-01 00:25:00')], - names=('a', 'b', 'dti'))""" + names=['a', 'b', 'dti'])""" assert result == expected result = mi.__repr__() @@ -140,7 +140,7 @@ def test_rjust(self): ('abc', 10, '2000-01-01 00:33:17'), ('abc', 10, '2000-01-01 00:33:18'), ('abc', 10, '2000-01-01 00:33:19')], - names=('a', 'b', 'dti'), length=2000)""" + names=['a', 'b', 'dti'], length=2000)""" assert result == expected def test_tuple_width(self): @@ -152,7 +152,7 @@ def test_tuple_width(self): mi = MultiIndex.from_arrays(levels, names=names) result = mi[:1].__repr__() expected = """MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...)], - names=('a', 'b', 'dti_1', 'dti_2', 'dti_3'))""" # noqa: E501 + names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])""" # noqa: E501 assert result == expected result = mi[:10].__repr__() @@ -167,7 +167,7 @@ def test_tuple_width(self): ('a', 9, '2000-01-01 00:00:07', '2000-01-01 00:00:07', ...), ('a', 9, '2000-01-01 00:00:08', '2000-01-01 00:00:08', ...), ('a', 9, '2000-01-01 00:00:09', '2000-01-01 00:00:09', ...)], - names=('a', 'b', 'dti_1', 'dti_2', 'dti_3'))""" + names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])""" assert result == expected result = mi.__repr__() @@ -193,7 +193,7 @@ def test_tuple_width(self): ('abc', 10, '2000-01-01 00:33:17', '2000-01-01 00:33:17', ...), ('abc', 10, '2000-01-01 00:33:18', '2000-01-01 00:33:18', ...), ('abc', 10, '2000-01-01 00:33:19', '2000-01-01 00:33:19', ...)], - names=('a', 'b', 'dti_1', 'dti_2', 'dti_3'), length=2000)""" + names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'], length=2000)""" assert result == expected def test_multiindex_long_element(self): diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index d17b0aae953cd..dd4bba42eda6f 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -101,16 +101,16 @@ def test_get_level_number_out_of_bounds(multiindex_dataframe_random_data): def test_set_name_methods(idx): # so long as these are synonyms, we don't need to test set_names - index_names = ("first", "second") + index_names = ["first", "second"] assert idx.rename == idx.set_names - new_names = tuple(name + "SUFFIX" for name in index_names) + new_names = [name + "SUFFIX" for name in index_names] ind = idx.set_names(new_names) assert idx.names == index_names assert ind.names == new_names msg = "Length of names must match number of levels in MultiIndex" with pytest.raises(ValueError, match=msg): ind.set_names(new_names + new_names) - new_names2 = tuple(name + "SUFFIX2" for name in new_names) + new_names2 = [name + "SUFFIX2" for name in new_names] res = ind.set_names(new_names2, inplace=True) assert res is None assert ind.names == new_names2 @@ -118,11 +118,11 @@ def test_set_name_methods(idx): # set names for specific level (# GH7792) ind = idx.set_names(new_names[0], level=0) assert idx.names == index_names - assert ind.names == (new_names[0], index_names[1]) + assert ind.names == [new_names[0], index_names[1]] res = 
ind.set_names(new_names2[0], level=0, inplace=True) assert res is None - assert ind.names == (new_names2[0], index_names[1]) + assert ind.names == [new_names2[0], index_names[1]] # set names for multiple levels ind = idx.set_names(new_names, level=[0, 1]) diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py index f6d960bd41925..d570e911bf584 100644 --- a/pandas/tests/indexes/multi/test_integrity.py +++ b/pandas/tests/indexes/multi/test_integrity.py @@ -216,9 +216,7 @@ def test_can_hold_identifiers(idx): def test_metadata_immutable(idx): levels, codes = idx.levels, idx.codes # shouldn't be able to set at either the top level or base level - mutable_regex = re.compile( - "does not support mutable operations|does not support item assignment" - ) + mutable_regex = re.compile("does not support mutable operations") with pytest.raises(TypeError, match=mutable_regex): levels[0] = levels[0] with pytest.raises(TypeError, match=mutable_regex): diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index aff9ebfb1c1e3..45f19b4d70fb9 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -60,20 +60,20 @@ def test_copy_names(): multi_idx1 = multi_idx.copy() assert multi_idx.equals(multi_idx1) - assert multi_idx.names == ("MyName1", "MyName2") - assert multi_idx1.names == ("MyName1", "MyName2") + assert multi_idx.names == ["MyName1", "MyName2"] + assert multi_idx1.names == ["MyName1", "MyName2"] multi_idx2 = multi_idx.copy(names=["NewName1", "NewName2"]) assert multi_idx.equals(multi_idx2) - assert multi_idx.names == ("MyName1", "MyName2") - assert multi_idx2.names == ("NewName1", "NewName2") + assert multi_idx.names == ["MyName1", "MyName2"] + assert multi_idx2.names == ["NewName1", "NewName2"] multi_idx3 = multi_idx.copy(name=["NewName1", "NewName2"]) assert multi_idx.equals(multi_idx3) - assert multi_idx.names == ("MyName1", "MyName2") - assert multi_idx3.names == ("NewName1", "NewName2") + assert multi_idx.names == ["MyName1", "MyName2"] + assert multi_idx3.names == ["NewName1", "NewName2"] # gh-35592 with pytest.raises(ValueError, match="Length of new names must be 2, got 1"): @@ -85,8 +85,8 @@ def test_copy_names(): def test_names(idx): # names are assigned in setup - assert idx.names == ("first", "second") - level_names = tuple(level.name for level in idx.levels) + assert idx.names == ["first", "second"] + level_names = [level.name for level in idx.levels] assert level_names == idx.names # setting bad names on existing diff --git a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py index d949a390bd97f..d1b4fe8b98760 100644 --- a/pandas/tests/indexes/multi/test_reindex.py +++ b/pandas/tests/indexes/multi/test_reindex.py @@ -12,13 +12,13 @@ def test_reindex(idx): result, indexer = idx.reindex(list(idx[:4])) assert isinstance(result, MultiIndex) - assert result.names == ("first", "second") + assert result.names == ["first", "second"] assert [level.name for level in result.levels] == ["first", "second"] result, indexer = idx.reindex(list(idx)) assert isinstance(result, MultiIndex) assert indexer is None - assert result.names == ("first", "second") + assert result.names == ["first", "second"] assert [level.name for level in result.levels] == ["first", "second"] @@ -52,27 +52,27 @@ def test_reindex_preserves_names_when_target_is_list_or_ndarray(idx): other_dtype = MultiIndex.from_product([[1, 2], [3, 4]]) # list & ndarray 
cases - assert idx.reindex([])[0].names == (None, None) - assert idx.reindex(np.array([]))[0].names == (None, None) - assert idx.reindex(target.tolist())[0].names == (None, None) - assert idx.reindex(target.values)[0].names == (None, None) - assert idx.reindex(other_dtype.tolist())[0].names == (None, None) - assert idx.reindex(other_dtype.values)[0].names == (None, None) + assert idx.reindex([])[0].names == [None, None] + assert idx.reindex(np.array([]))[0].names == [None, None] + assert idx.reindex(target.tolist())[0].names == [None, None] + assert idx.reindex(target.values)[0].names == [None, None] + assert idx.reindex(other_dtype.tolist())[0].names == [None, None] + assert idx.reindex(other_dtype.values)[0].names == [None, None] idx.names = ["foo", "bar"] - assert idx.reindex([])[0].names == ("foo", "bar") - assert idx.reindex(np.array([]))[0].names == ("foo", "bar") - assert idx.reindex(target.tolist())[0].names == ("foo", "bar") - assert idx.reindex(target.values)[0].names == ("foo", "bar") - assert idx.reindex(other_dtype.tolist())[0].names == ("foo", "bar") - assert idx.reindex(other_dtype.values)[0].names == ("foo", "bar") + assert idx.reindex([])[0].names == ["foo", "bar"] + assert idx.reindex(np.array([]))[0].names == ["foo", "bar"] + assert idx.reindex(target.tolist())[0].names == ["foo", "bar"] + assert idx.reindex(target.values)[0].names == ["foo", "bar"] + assert idx.reindex(other_dtype.tolist())[0].names == ["foo", "bar"] + assert idx.reindex(other_dtype.values)[0].names == ["foo", "bar"] def test_reindex_lvl_preserves_names_when_target_is_list_or_array(): # GH7774 idx = MultiIndex.from_product([[0, 1], ["a", "b"]], names=["foo", "bar"]) - assert idx.reindex([], level=0)[0].names == ("foo", "bar") - assert idx.reindex([], level=1)[0].names == ("foo", "bar") + assert idx.reindex([], level=0)[0].names == ["foo", "bar"] + assert idx.reindex([], level=1)[0].names == ["foo", "bar"] def test_reindex_lvl_preserves_type_if_target_is_empty_list_or_array( diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index 1bf91a09ee754..06dbb33aadf97 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -23,7 +23,7 @@ def test_insert(idx): exp0 = Index(list(idx.levels[0]) + ["abc"], name="first") tm.assert_index_equal(new_index.levels[0], exp0) - assert new_index.names == ("first", "second") + assert new_index.names == ["first", "second"] exp1 = Index(list(idx.levels[1]) + ["three"], name="second") tm.assert_index_equal(new_index.levels[1], exp1) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 15076b8705bdc..9354984538c58 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -121,7 +121,7 @@ def test_multiindex_symmetric_difference(): idx2 = idx.copy().rename(["A", "B"]) result = idx.symmetric_difference(idx2) - assert result.names == (None, None) + assert result.names == [None, None] def test_empty(idx): diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index a5a678af4aba7..3d21ee8a57716 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -13,6 +13,7 @@ Timestamp, ) import pandas._testing as tm +from pandas.core.indexes.frozen import FrozenList def test_sortlevel(idx): @@ -285,9 +286,8 @@ def test_remove_unused_levels_with_nan(): idx = idx.set_levels(["a", np.nan], level="id1") 
idx = idx.remove_unused_levels() result = idx.levels - expected = (Index(["a", np.nan], name="id1"), Index([4], name="id2")) - for res, exp in zip(result, expected): - tm.assert_index_equal(res, exp) + expected = FrozenList([["a", np.nan], [4]]) + assert str(result) == str(expected) def test_sort_values_nan(): diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index 088fcfcd7d75f..676d33d2b0f81 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -312,7 +312,10 @@ def test_cant_or_shouldnt_cast(self, dtype): def test_view_index(self, simple_index): index = simple_index - msg = "Cannot change data-type for object array" + msg = ( + "Cannot change data-type for array of references.|" + "Cannot change data-type for object array.|" + ) with pytest.raises(TypeError, match=msg): index.view(Index) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 34cc8eab4d812..039836da75cd5 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -7,7 +7,6 @@ NA, is_matching_na, ) -from pandas.compat import pa_version_under16p0 import pandas.util._test_decorators as td import pandas as pd @@ -202,16 +201,7 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected, dtype, request): - if ( - not pa_version_under16p0 - and dtype == "string[pyarrow_numpy]" - and in_slice == slice("a", "a", -1) - ): - request.applymarker( - pytest.mark.xfail(reason="https://github.com/apache/arrow/issues/40642") - ) - + def test_slice_locs_negative_step(self, in_slice, expected, dtype): index = Index(list("bcdxy"), dtype=dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index ec2216c102c3f..6aba9f17326ba 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -196,11 +196,9 @@ def test_constructor_invalid_quarters(self): ) def test_period_range_fractional_period(self): - msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = period_range("2007-01", periods=10.5, freq="M") - exp = period_range("2007-01", periods=10, freq="M") - tm.assert_index_equal(result, exp) + msg = "periods must be an integer, got 10.5" + with pytest.raises(TypeError, match=msg): + period_range("2007-01", periods=10.5, freq="M") def test_constructor_with_without_freq(self): # GH53687 diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index fb200d071951e..67f4d7421df23 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -70,7 +70,7 @@ def test_start_end_non_nat(self): def test_periods_requires_integer(self): # invalid periods param - msg = "periods must be a number, got foo" + msg = "periods must be an integer, got foo" with pytest.raises(TypeError, match=msg): period_range(start="2017Q1", periods="foo") diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 8d41efa586411..727edb7ae30ad 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ 
b/pandas/tests/indexes/ranges/test_range.py @@ -375,7 +375,10 @@ def test_cant_or_shouldnt_cast(self, start, stop, step): def test_view_index(self, simple_index): index = simple_index - msg = "Cannot change data-type for object array" + msg = ( + "Cannot change data-type for array of references.|" + "Cannot change data-type for object array.|" + ) with pytest.raises(TypeError, match=msg): index.view(Index) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 997276ef544f7..3a2d04d3ffdc2 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -358,7 +358,10 @@ def test_view_with_args_object_array_raises(self, index): with pytest.raises(NotImplementedError, match="i8"): index.view("i8") else: - msg = "Cannot change data-type for object array" + msg = ( + "Cannot change data-type for array of references.|" + "Cannot change data-type for object array.|" + ) with pytest.raises(TypeError, match=msg): index.view("i8") @@ -905,7 +908,7 @@ def test_isin_level_kwarg_bad_level_raises(self, index): @pytest.mark.parametrize("label", [1.0, "foobar", "xyzzy", np.nan]) def test_isin_level_kwarg_bad_label_raises(self, label, index): if isinstance(index, MultiIndex): - index = index.rename(("foo", "bar") + index.names[2:]) + index = index.rename(["foo", "bar"] + index.names[2:]) msg = f"'Level {label} not found'" else: index = index.rename("foo") diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index a2dee61295c74..b6e1c3698c258 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -119,7 +119,7 @@ def test_set_name_methods(self, index_flat): # should return None assert res is None assert index.name == new_name - assert index.names == (new_name,) + assert index.names == [new_name] with pytest.raises(ValueError, match="Level must be None"): index.set_names("a", level=0) @@ -127,7 +127,7 @@ def test_set_name_methods(self, index_flat): name = ("A", "B") index.rename(name, inplace=True) assert index.name == name - assert index.names == (name,) + assert index.names == [name] @pytest.mark.xfail def test_set_names_single_label_no_level(self, index_flat): @@ -479,6 +479,17 @@ def test_sort_values_with_missing(index_with_missing, na_position, request): tm.assert_index_equal(result, expected) +def test_sort_values_natsort_key(): + # GH#56081 + def split_convert(s): + return tuple(int(x) for x in s.split(".")) + + idx = pd.Index(["1.9", "2.0", "1.11", "1.10"]) + expected = pd.Index(["1.9", "1.10", "1.11", "2.0"]) + result = idx.sort_values(key=lambda x: tuple(map(split_convert, x))) + tm.assert_index_equal(result, expected) + + def test_ndarray_compat_properties(index): if isinstance(index, PeriodIndex) and not IS64: pytest.skip("Overflow") diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 0ad5888a44392..e45d11e6286e2 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -88,7 +88,10 @@ def test_view(self, simple_index): result = type(simple_index)(idx) tm.assert_index_equal(result, idx) - msg = "Cannot change data-type for object array" + msg = ( + "Cannot change data-type for array of references.|" + "Cannot change data-type for object array.|" + ) with pytest.raises(TypeError, match=msg): idx.view(type(simple_index)) diff --git a/pandas/tests/indexes/test_frozen.py b/pandas/tests/indexes/test_frozen.py new file mode 100644 index 0000000000000..ace66b5b06a51 --- 
/dev/null +++ b/pandas/tests/indexes/test_frozen.py @@ -0,0 +1,113 @@ +import re + +import pytest + +from pandas.core.indexes.frozen import FrozenList + + +@pytest.fixture +def lst(): + return [1, 2, 3, 4, 5] + + +@pytest.fixture +def container(lst): + return FrozenList(lst) + + +@pytest.fixture +def unicode_container(): + return FrozenList(["\u05d0", "\u05d1", "c"]) + + +class TestFrozenList: + def check_mutable_error(self, *args, **kwargs): + # Pass whatever function you normally would to pytest.raises + # (after the Exception kind). + mutable_regex = re.compile("does not support mutable operations") + msg = "'(_s)?re.(SRE_)?Pattern' object is not callable" + with pytest.raises(TypeError, match=msg): + mutable_regex(*args, **kwargs) + + def test_no_mutable_funcs(self, container): + def setitem(): + container[0] = 5 + + self.check_mutable_error(setitem) + + def setslice(): + container[1:2] = 3 + + self.check_mutable_error(setslice) + + def delitem(): + del container[0] + + self.check_mutable_error(delitem) + + def delslice(): + del container[0:3] + + self.check_mutable_error(delslice) + + mutable_methods = ("extend", "pop", "remove", "insert") + + for meth in mutable_methods: + self.check_mutable_error(getattr(container, meth)) + + def test_slicing_maintains_type(self, container, lst): + result = container[1:2] + expected = lst[1:2] + self.check_result(result, expected) + + def check_result(self, result, expected): + assert isinstance(result, FrozenList) + assert result == expected + + def test_string_methods_dont_fail(self, container): + repr(container) + str(container) + bytes(container) + + def test_tricky_container(self, unicode_container): + repr(unicode_container) + str(unicode_container) + + def test_add(self, container, lst): + result = container + (1, 2, 3) + expected = FrozenList(lst + [1, 2, 3]) + self.check_result(result, expected) + + result = (1, 2, 3) + container + expected = FrozenList([1, 2, 3] + lst) + self.check_result(result, expected) + + def test_iadd(self, container, lst): + q = r = container + + q += [5] + self.check_result(q, lst + [5]) + + # Other shouldn't be mutated. 
+ self.check_result(r, lst) + + def test_union(self, container, lst): + result = container.union((1, 2, 3)) + expected = FrozenList(lst + [1, 2, 3]) + self.check_result(result, expected) + + def test_difference(self, container): + result = container.difference([2]) + expected = FrozenList([1, 3, 4, 5]) + self.check_result(result, expected) + + def test_difference_dupe(self): + result = FrozenList([1, 2, 3, 2]).difference([2]) + expected = FrozenList([1, 3]) + self.check_result(result, expected) + + def test_tricky_container_to_bytes_raises(self, unicode_container): + # GH 26447 + msg = "^'str' object cannot be interpreted as an integer$" + with pytest.raises(TypeError, match=msg): + bytes(unicode_container) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index f41c6870cdb1c..9b4470021cc1d 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -222,12 +222,7 @@ def test_logical_compat(self, simple_index): assert idx.any() == idx._values.any() assert idx.any() == idx.to_series().any() else: - msg = "cannot perform (any|all)" - if isinstance(idx, IntervalIndex): - msg = ( - r"'IntervalArray' with dtype interval\[.*\] does " - "not support reduction '(any|all)'" - ) + msg = "does not support operation '(any|all)'" with pytest.raises(TypeError, match=msg): idx.all() with pytest.raises(TypeError, match=msg): @@ -885,7 +880,10 @@ def test_view(self, simple_index): idx_view = idx.view(dtype) tm.assert_index_equal(idx, index_cls(idx_view, name="Foo"), exact=True) - msg = "Cannot change data-type for object array" + msg = ( + "Cannot change data-type for array of references.|" + "Cannot change data-type for object array.|" + ) with pytest.raises(TypeError, match=msg): # GH#55709 idx.view(index_cls) diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py index 895ea110c8ad5..12ac5dd63bd8c 100644 --- a/pandas/tests/indexes/timedeltas/test_constructors.py +++ b/pandas/tests/indexes/timedeltas/test_constructors.py @@ -143,14 +143,12 @@ def test_constructor_iso(self): tm.assert_index_equal(result, expected) def test_timedelta_range_fractional_period(self): - msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" - with tm.assert_produces_warning(FutureWarning, match=msg): - rng = timedelta_range("1 days", periods=10.5) - exp = timedelta_range("1 days", periods=10) - tm.assert_index_equal(rng, exp) + msg = "periods must be an integer" + with pytest.raises(TypeError, match=msg): + timedelta_range("1 days", periods=10.5) def test_constructor_coverage(self): - msg = "periods must be a number, got foo" + msg = "periods must be an integer, got foo" with pytest.raises(TypeError, match=msg): timedelta_range(start="1 days", periods="foo", freq="D") diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index 78f701fff6e29..dbfabf7666d25 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -150,7 +150,7 @@ def test_getitem_intkey_leading_level( # GH#33355 dont fall-back to positional when leading level is int ymd = multiindex_year_month_day_dataframe_random_data levels = ymd.index.levels - ymd.index = ymd.index.set_levels((levels[0].astype(dtype),) + levels[1:]) + ymd.index = ymd.index.set_levels([levels[0].astype(dtype)] + levels[1:]) ser = ymd["A"] mi = ser.index assert isinstance(mi, MultiIndex) diff --git 
a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py index d78694018749c..217ca74bd7fbd 100644 --- a/pandas/tests/indexing/test_at.py +++ b/pandas/tests/indexing/test_at.py @@ -136,7 +136,11 @@ def test_at_datetime_index(self, row): class TestAtSetItemWithExpansion: def test_at_setitem_expansion_series_dt64tz_value(self, tz_naive_fixture): # GH#25506 - ts = Timestamp("2017-08-05 00:00:00+0100", tz=tz_naive_fixture) + ts = ( + Timestamp("2017-08-05 00:00:00+0100", tz=tz_naive_fixture) + if tz_naive_fixture is not None + else Timestamp("2017-08-05 00:00:00+0100") + ) result = Series(ts) result.at[1] = ts expected = Series([ts, ts]) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 2a2772d1b3453..b28c3cba7d310 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -284,7 +284,9 @@ def test_detect_chained_assignment_changing_dtype(self): with tm.raises_chained_assignment_error(): df.loc[2]["C"] = "foo" tm.assert_frame_equal(df, df_original) - with tm.raises_chained_assignment_error(extra_warnings=(FutureWarning,)): + with tm.raises_chained_assignment_error( + extra_warnings=(FutureWarning,), extra_match=(None,) + ): df["C"][2] = "foo" tm.assert_frame_equal(df, df_original) diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index 5bff1b7be3080..7ab8988521fdf 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -3,10 +3,14 @@ in core.internals """ +import datetime + +import numpy as np import pytest import pandas as pd import pandas._testing as tm +from pandas.api.internals import create_dataframe_from_blocks from pandas.core import internals from pandas.core.internals import api @@ -71,3 +75,91 @@ def test_create_block_manager_from_blocks_deprecated(): ) with tm.assert_produces_warning(DeprecationWarning, match=msg): internals.create_block_manager_from_blocks + + +def test_create_dataframe_from_blocks(float_frame): + block = float_frame._mgr.blocks[0] + index = float_frame.index.copy() + columns = float_frame.columns.copy() + + result = create_dataframe_from_blocks( + [(block.values, block.mgr_locs.as_array)], index=index, columns=columns + ) + tm.assert_frame_equal(result, float_frame) + + +def test_create_dataframe_from_blocks_types(): + df = pd.DataFrame( + { + "int": list(range(1, 4)), + "uint": np.arange(3, 6).astype("uint8"), + "float": [2.0, np.nan, 3.0], + "bool": np.array([True, False, True]), + "boolean": pd.array([True, False, None], dtype="boolean"), + "string": list("abc"), + "datetime": pd.date_range("20130101", periods=3), + "datetimetz": pd.date_range("20130101", periods=3).tz_localize( + "Europe/Brussels" + ), + "timedelta": pd.timedelta_range("1 day", periods=3), + "period": pd.period_range("2012-01-01", periods=3, freq="D"), + "categorical": pd.Categorical(["a", "b", "a"]), + "interval": pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4)]), + } + ) + + result = create_dataframe_from_blocks( + [(block.values, block.mgr_locs.as_array) for block in df._mgr.blocks], + index=df.index, + columns=df.columns, + ) + tm.assert_frame_equal(result, df) + + +def test_create_dataframe_from_blocks_datetimelike(): + # extension dtypes that have an exact matching numpy dtype can also be + # be passed as a numpy array + index, columns = pd.RangeIndex(3), pd.Index(["a", "b", "c", "d"]) + + block_array1 = np.arange( + datetime.datetime(2020, 1, 1), + 
datetime.datetime(2020, 1, 7), + step=datetime.timedelta(1), + ).reshape((2, 3)) + block_array2 = np.arange( + datetime.timedelta(1), datetime.timedelta(7), step=datetime.timedelta(1) + ).reshape((2, 3)) + result = create_dataframe_from_blocks( + [(block_array1, np.array([0, 2])), (block_array2, np.array([1, 3]))], + index=index, + columns=columns, + ) + expected = pd.DataFrame( + { + "a": pd.date_range("2020-01-01", periods=3, unit="us"), + "b": pd.timedelta_range("1 days", periods=3, unit="us"), + "c": pd.date_range("2020-01-04", periods=3, unit="us"), + "d": pd.timedelta_range("4 days", periods=3, unit="us"), + } + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "array", + [ + pd.date_range("2020-01-01", periods=3), + pd.date_range("2020-01-01", periods=3, tz="UTC"), + pd.period_range("2012-01-01", periods=3, freq="D"), + pd.timedelta_range("1 day", periods=3), + ], +) +def test_create_dataframe_from_blocks_1dEA(array): + # ExtensionArrays can be passed as 1D even if stored under the hood as 2D + df = pd.DataFrame({"a": array}) + + block = df._mgr.blocks[0] + result = create_dataframe_from_blocks( + [(block.values[0], block.mgr_locs.as_array)], index=df.index, columns=df.columns + ) + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/io/data/stata/stata-compat-be-105.dta b/pandas/tests/io/data/stata/stata-compat-be-105.dta new file mode 100644 index 0000000000000..af75548c840d4 Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-105.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-108.dta b/pandas/tests/io/data/stata/stata-compat-be-108.dta new file mode 100644 index 0000000000000..e3e5d85fca4ad Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-108.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-111.dta b/pandas/tests/io/data/stata/stata-compat-be-111.dta new file mode 100644 index 0000000000000..197decdcf0c2d Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-111.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-113.dta b/pandas/tests/io/data/stata/stata-compat-be-113.dta new file mode 100644 index 0000000000000..c69c32106114f Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-113.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-114.dta b/pandas/tests/io/data/stata/stata-compat-be-114.dta new file mode 100644 index 0000000000000..222bdb2b62784 Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-114.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-118.dta b/pandas/tests/io/data/stata/stata-compat-be-118.dta new file mode 100644 index 0000000000000..0a5df1b321c2d Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-118.dta differ diff --git a/pandas/tests/io/data/stata/stata10_115.dta b/pandas/tests/io/data/stata/stata10_115.dta index b917dde5ad47d..bdca3b9b340c1 100644 Binary files a/pandas/tests/io/data/stata/stata10_115.dta and b/pandas/tests/io/data/stata/stata10_115.dta differ diff --git a/pandas/tests/io/data/stata/stata4_105.dta b/pandas/tests/io/data/stata/stata4_105.dta new file mode 100644 index 0000000000000..f804c315b344b Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_105.dta differ diff --git a/pandas/tests/io/data/stata/stata4_108.dta b/pandas/tests/io/data/stata/stata4_108.dta new file mode 100644 index 0000000000000..e78c24b319e47 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_108.dta differ 
diff --git a/pandas/tests/io/data/stata/stata4_111.dta b/pandas/tests/io/data/stata/stata4_111.dta new file mode 100644 index 0000000000000..b69034174fcfe Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_111.dta differ diff --git a/pandas/tests/io/data/stata/stata4_114.dta b/pandas/tests/io/data/stata/stata4_114.dta index c5d7de8b42295..f58cdb215332e 100644 Binary files a/pandas/tests/io/data/stata/stata4_114.dta and b/pandas/tests/io/data/stata/stata4_114.dta differ diff --git a/pandas/tests/io/data/stata/stata9_115.dta b/pandas/tests/io/data/stata/stata9_115.dta index 5ad6cd6a2c8ff..1b5c0042bebbe 100644 Binary files a/pandas/tests/io/data/stata/stata9_115.dta and b/pandas/tests/io/data/stata/stata9_115.dta differ diff --git a/pandas/tests/io/formats/style/test_html.py b/pandas/tests/io/formats/style/test_html.py index 2306324efb974..752b9b391f9cb 100644 --- a/pandas/tests/io/formats/style/test_html.py +++ b/pandas/tests/io/formats/style/test_html.py @@ -821,7 +821,6 @@ def test_rendered_links(type, text, exp, found): def test_multiple_rendered_links(): links = ("www.a.b", "http://a.c", "https://a.d", "ftp://a.e") - # pylint: disable-next=consider-using-f-string df = DataFrame(["text {} {} text {} {}".format(*links)]) result = df.style.format(hyperlinks="html").to_html() href = '{0}' diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index afc9974c75e6a..a728f6ec6ca9a 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -115,7 +115,7 @@ def test_multiindex(self, df_schema, using_infer_string): {"name": "C", "type": "datetime"}, {"name": "D", "type": "duration"}, ], - "primaryKey": ("level_0", "level_1"), + "primaryKey": ["level_0", "level_1"], } if using_infer_string: expected["fields"][0] = { @@ -128,7 +128,7 @@ def test_multiindex(self, df_schema, using_infer_string): df.index.names = ["idx0", None] expected["fields"][0]["name"] = "idx0" - expected["primaryKey"] = ("idx0", "level_1") + expected["primaryKey"] = ["idx0", "level_1"] result = build_table_schema(df, version=False) assert result == expected @@ -598,21 +598,21 @@ def test_categorical(self): (pd.Index([1], name="myname"), "myname", "name"), ( pd.MultiIndex.from_product([("a", "b"), ("c", "d")]), - ("level_0", "level_1"), + ["level_0", "level_1"], "names", ), ( pd.MultiIndex.from_product( [("a", "b"), ("c", "d")], names=["n1", "n2"] ), - ("n1", "n2"), + ["n1", "n2"], "names", ), ( pd.MultiIndex.from_product( [("a", "b"), ("c", "d")], names=["n1", None] ), - ("n1", "level_1"), + ["n1", "level_1"], "names", ), ], diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 0827f64dccf46..bd47e045417ce 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -196,7 +196,6 @@ def test_warn_bad_lines(all_parsers): expected_warning = ParserWarning if parser.engine == "pyarrow": match_msg = "Expected 1 columns, but found 3: 1,2,3" - expected_warning = (ParserWarning, DeprecationWarning) with tm.assert_produces_warning( expected_warning, match=match_msg, check_stacklevel=False @@ -315,7 +314,6 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers): expected_warning = ParserWarning if parser.engine == "pyarrow": match_msg = "Expected 2 columns, but found 3: a,b,c" - expected_warning = (ParserWarning, DeprecationWarning) with tm.assert_produces_warning( expected_warning, 
match=match_msg, check_stacklevel=False diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index ba0e3033321e4..1e370f649aef8 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -532,6 +532,47 @@ def test_na_values_dict_aliasing(all_parsers): tm.assert_dict_equal(na_values, na_values_copy) +def test_na_values_dict_null_column_name(all_parsers): + # see gh-57547 + parser = all_parsers + data = ",x,y\n\nMA,1,2\nNA,2,1\nOA,,3" + names = [None, "x", "y"] + na_values = {name: STR_NA_VALUES for name in names} + dtype = {None: "object", "x": "float64", "y": "float64"} + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + index_col=0, + header=0, + dtype=dtype, + names=names, + na_values=na_values, + keep_default_na=False, + ) + return + + expected = DataFrame( + {None: ["MA", "NA", "OA"], "x": [1.0, 2.0, np.nan], "y": [2.0, 1.0, 3.0]} + ) + + expected = expected.set_index(None) + + result = parser.read_csv( + StringIO(data), + index_col=0, + header=0, + dtype=dtype, + names=names, + na_values=na_values, + keep_default_na=False, + ) + + tm.assert_frame_equal(result, expected) + + def test_na_values_dict_col_index(all_parsers): # see gh-14203 data = "a\nfoo\n1" diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 0bc0c3e744db7..8968948df5fa9 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -343,7 +343,7 @@ def test_multiple_date_col(all_parsers, keep_date_col, request): "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], } with tm.assert_produces_warning( - (DeprecationWarning, FutureWarning), match=depr_msg, check_stacklevel=False + FutureWarning, match=depr_msg, check_stacklevel=False ): result = parser.read_csv(StringIO(data), **kwds) @@ -724,7 +724,7 @@ def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): ) with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + FutureWarning, match=depr_msg, check_stacklevel=False ): parser.read_csv(StringIO(data), parse_dates=parse_dates) @@ -1248,14 +1248,14 @@ def test_multiple_date_col_named_index_compat(all_parsers): "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + FutureWarning, match=depr_msg, check_stacklevel=False ): with_indices = parser.read_csv( StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal" ) with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + FutureWarning, match=depr_msg, check_stacklevel=False ): with_names = parser.read_csv( StringIO(data), @@ -1280,13 +1280,13 @@ def test_multiple_date_col_multiple_index_compat(all_parsers): "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + FutureWarning, match=depr_msg, check_stacklevel=False ): result = parser.read_csv( StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]} ) with tm.assert_produces_warning( - (FutureWarning, 
DeprecationWarning), match=depr_msg, check_stacklevel=False + FutureWarning, match=depr_msg, check_stacklevel=False ): expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) @@ -2267,7 +2267,7 @@ def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates): "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + FutureWarning, match=depr_msg, check_stacklevel=False ): result = parser.read_csv( StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 8d4c28bd61fa1..44a55cf3be240 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -155,12 +155,8 @@ def test_pyarrow_engine(self): kwargs[default] = "warn" warn = None - depr_msg = None + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" if "delim_whitespace" in kwargs: - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - warn = FutureWarning - if "verbose" in kwargs: - depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated" warn = FutureWarning with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index 75efe87c408c0..ab98857e0c178 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -146,7 +146,7 @@ def test_usecols_with_parse_dates4(all_parsers): "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + FutureWarning, match=depr_msg, check_stacklevel=False ): result = parser.read_csv( StringIO(data), @@ -187,7 +187,7 @@ def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names, request "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) with tm.assert_produces_warning( - (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + FutureWarning, match=depr_msg, check_stacklevel=False ): result = parser.read_csv( StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index e62df0bc1c977..471f7b8958ee4 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -1024,7 +1024,7 @@ def test_columns_multiindex_modified(tmp_path, setup_path): df.index.name = "letters" df = df.set_index(keys="E", append=True) - data_columns = list(df.index.names) + df.columns.tolist() + data_columns = df.index.names + df.columns.tolist() path = tmp_path / setup_path df.to_hdf( path, diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 2c0f19dc74ed2..f16f3a2a5c775 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -152,7 +152,6 @@ def test_to_html_compat(self, flavor_read_html): np.random.default_rng(2).random((4, 3)), columns=pd.Index(list("abc"), dtype=object), ) - # pylint: disable-next=consider-using-f-string .map("{:.3f}".format) .astype(float) ) @@ -1460,7 +1459,6 @@ def seek(self, offset): def seekable(self): return True - # GH 49036 pylint checks for presence of __next__ for iterators def 
__next__(self): ... def __iter__(self) -> Iterator: diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 67b1311a5a798..3083fa24ba8b5 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2346,18 +2346,15 @@ def test_read_table_index_col(conn, request, test_frame1): sql.to_sql(test_frame1, "test_frame", conn) result = sql.read_sql_table("test_frame", conn, index_col="index") - assert result.index.names == ("index",) + assert result.index.names == ["index"] result = sql.read_sql_table("test_frame", conn, index_col=["A", "B"]) - assert result.index.names == ("A", "B") + assert result.index.names == ["A", "B"] result = sql.read_sql_table( "test_frame", conn, index_col=["A", "B"], columns=["C", "D"] ) - assert result.index.names == ( - "A", - "B", - ) + assert result.index.names == ["A", "B"] assert result.columns.tolist() == ["C", "D"] diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 9078ca865042d..43c62237c6786 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -63,16 +63,16 @@ def read_csv(self, file): return read_csv(file, parse_dates=True) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_read_empty_dta(self, version): + def test_read_empty_dta(self, version, temp_file): empty_ds = DataFrame(columns=["unit"]) # GH 7369, make sure can read a 0-obs dta file - with tm.ensure_clean() as path: - empty_ds.to_stata(path, write_index=False, version=version) - empty_ds2 = read_stata(path) - tm.assert_frame_equal(empty_ds, empty_ds2) + path = temp_file + empty_ds.to_stata(path, write_index=False, version=version) + empty_ds2 = read_stata(path) + tm.assert_frame_equal(empty_ds, empty_ds2) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_read_empty_dta_with_dtypes(self, version): + def test_read_empty_dta_with_dtypes(self, version, temp_file): # GH 46240 # Fixing above bug revealed that types are not correctly preserved when # writing empty DataFrames @@ -91,9 +91,9 @@ def test_read_empty_dta_with_dtypes(self, version): } ) # GH 7369, make sure can read a 0-obs dta file - with tm.ensure_clean() as path: - empty_df_typed.to_stata(path, write_index=False, version=version) - empty_reread = read_stata(path) + path = temp_file + empty_df_typed.to_stata(path, write_index=False, version=version) + empty_reread = read_stata(path) expected = empty_df_typed # No uint# support. 
Downcast since values in range for int# @@ -108,12 +108,12 @@ def test_read_empty_dta_with_dtypes(self, version): tm.assert_series_equal(expected.dtypes, empty_reread.dtypes) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_read_index_col_none(self, version): + def test_read_index_col_none(self, version, temp_file): df = DataFrame({"a": range(5), "b": ["b1", "b2", "b3", "b4", "b5"]}) # GH 7369, make sure can read a 0-obs dta file - with tm.ensure_clean() as path: - df.to_stata(path, write_index=False, version=version) - read_df = read_stata(path) + path = temp_file + df.to_stata(path, write_index=False, version=version) + read_df = read_stata(path) assert isinstance(read_df.index, pd.RangeIndex) expected = df @@ -225,7 +225,7 @@ def test_read_dta3(self, file, datapath): tm.assert_frame_equal(parsed, expected) @pytest.mark.parametrize( - "file", ["stata4_113", "stata4_114", "stata4_115", "stata4_117"] + "file", ["stata4_111", "stata4_113", "stata4_114", "stata4_115", "stata4_117"] ) def test_read_dta4(self, file, datapath): file = datapath("io", "data", "stata", f"{file}.dta") @@ -270,6 +270,52 @@ def test_read_dta4(self, file, datapath): # stata doesn't save .category metadata tm.assert_frame_equal(parsed, expected) + @pytest.mark.parametrize("file", ["stata4_105", "stata4_108"]) + def test_readold_dta4(self, file, datapath): + # This test is the same as test_read_dta4 above except that the columns + # had to be renamed to match the restrictions in older file format + file = datapath("io", "data", "stata", f"{file}.dta") + parsed = self.read_dta(file) + + expected = DataFrame.from_records( + [ + ["one", "ten", "one", "one", "one"], + ["two", "nine", "two", "two", "two"], + ["three", "eight", "three", "three", "three"], + ["four", "seven", 4, "four", "four"], + ["five", "six", 5, np.nan, "five"], + ["six", "five", 6, np.nan, "six"], + ["seven", "four", 7, np.nan, "seven"], + ["eight", "three", 8, np.nan, "eight"], + ["nine", "two", 9, np.nan, "nine"], + ["ten", "one", "ten", np.nan, "ten"], + ], + columns=[ + "fulllab", + "fulllab2", + "incmplab", + "misslab", + "floatlab", + ], + ) + + # these are all categoricals + for col in expected: + orig = expected[col].copy() + + categories = np.asarray(expected["fulllab"][orig.notna()]) + if col == "incmplab": + categories = orig + + cat = orig.astype("category")._values + cat = cat.set_categories(categories, ordered=True) + cat.categories.rename(None, inplace=True) + + expected[col] = cat + + # stata doesn't save .category metadata + tm.assert_frame_equal(parsed, expected) + # File containing strls def test_read_dta12(self, datapath): parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta")) @@ -324,39 +370,39 @@ def test_read_dta18(self, datapath): assert rdr.data_label == "This is a Ünicode data label" - def test_read_write_dta5(self): + def test_read_write_dta5(self, temp_file): original = DataFrame( [(np.nan, np.nan, np.nan, np.nan, np.nan)], columns=["float_miss", "double_miss", "byte_miss", "int_miss", "long_miss"], ) original.index.name = "index" - with tm.ensure_clean() as path: - original.to_stata(path, convert_dates=None) - written_and_read_again = self.read_dta(path) + path = temp_file + original.to_stata(path, convert_dates=None) + written_and_read_again = self.read_dta(path) expected = original expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) - def test_write_dta6(self, datapath): + def test_write_dta6(self, 
datapath, temp_file): original = self.read_csv(datapath("io", "data", "stata", "stata3.csv")) original.index.name = "index" original.index = original.index.astype(np.int32) original["year"] = original["year"].astype(np.int32) original["quarter"] = original["quarter"].astype(np.int32) - with tm.ensure_clean() as path: - original.to_stata(path, convert_dates=None) - written_and_read_again = self.read_dta(path) - tm.assert_frame_equal( - written_and_read_again.set_index("index"), - original, - check_index_type=False, - ) + path = temp_file + original.to_stata(path, convert_dates=None) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal( + written_and_read_again.set_index("index"), + original, + check_index_type=False, + ) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_read_write_dta10(self, version): + def test_read_write_dta10(self, version, temp_file): original = DataFrame( data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]], columns=["string", "object", "integer", "floating", "datetime"], @@ -366,9 +412,9 @@ def test_read_write_dta10(self, version): original.index = original.index.astype(np.int32) original["integer"] = original["integer"].astype(np.int32) - with tm.ensure_clean() as path: - original.to_stata(path, convert_dates={"datetime": "tc"}, version=version) - written_and_read_again = self.read_dta(path) + path = temp_file + original.to_stata(path, convert_dates={"datetime": "tc"}, version=version) + written_and_read_again = self.read_dta(path) expected = original[:] # "tc" convert_dates means we store in ms @@ -379,14 +425,14 @@ def test_read_write_dta10(self, version): expected, ) - def test_stata_doc_examples(self): - with tm.ensure_clean() as path: - df = DataFrame( - np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB") - ) - df.to_stata(path) + def test_stata_doc_examples(self, temp_file): + path = temp_file + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB") + ) + df.to_stata(path) - def test_write_preserves_original(self): + def test_write_preserves_original(self, temp_file): # 9795 df = DataFrame( @@ -394,12 +440,12 @@ def test_write_preserves_original(self): ) df.loc[2, "a":"c"] = np.nan df_copy = df.copy() - with tm.ensure_clean() as path: - df.to_stata(path, write_index=False) + path = temp_file + df.to_stata(path, write_index=False) tm.assert_frame_equal(df, df_copy) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_encoding(self, version, datapath): + def test_encoding(self, version, datapath, temp_file): # GH 4626, proper encoding handling raw = read_stata(datapath("io", "data", "stata", "stata1_encoding.dta")) encoded = read_stata(datapath("io", "data", "stata", "stata1_encoding.dta")) @@ -409,12 +455,12 @@ def test_encoding(self, version, datapath): assert result == expected assert isinstance(result, str) - with tm.ensure_clean() as path: - encoded.to_stata(path, write_index=False, version=version) - reread_encoded = read_stata(path) - tm.assert_frame_equal(encoded, reread_encoded) + path = temp_file + encoded.to_stata(path, write_index=False, version=version) + reread_encoded = read_stata(path) + tm.assert_frame_equal(encoded, reread_encoded) - def test_read_write_dta11(self): + def test_read_write_dta11(self, temp_file): original = DataFrame( [(1, 2, 3, 4)], columns=[ @@ -431,18 +477,18 @@ def test_read_write_dta11(self): formatted.index.name = "index" formatted = formatted.astype(np.int32) - with tm.ensure_clean() as path: - 
with tm.assert_produces_warning(InvalidColumnName): - original.to_stata(path, convert_dates=None) + path = temp_file + with tm.assert_produces_warning(InvalidColumnName): + original.to_stata(path, convert_dates=None) - written_and_read_again = self.read_dta(path) + written_and_read_again = self.read_dta(path) expected = formatted expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_read_write_dta12(self, version): + def test_read_write_dta12(self, version, temp_file): original = DataFrame( [(1, 2, 3, 4, 5, 6)], columns=[ @@ -468,18 +514,18 @@ def test_read_write_dta12(self, version): formatted.index.name = "index" formatted = formatted.astype(np.int32) - with tm.ensure_clean() as path: - with tm.assert_produces_warning(InvalidColumnName): - original.to_stata(path, convert_dates=None, version=version) - # should get a warning for that format. + path = temp_file + with tm.assert_produces_warning(InvalidColumnName): + original.to_stata(path, convert_dates=None, version=version) + # should get a warning for that format. - written_and_read_again = self.read_dta(path) + written_and_read_again = self.read_dta(path) expected = formatted expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) - def test_read_write_dta13(self): + def test_read_write_dta13(self, temp_file): s1 = Series(2**9, dtype=np.int16) s2 = Series(2**17, dtype=np.int32) s3 = Series(2**33, dtype=np.int64) @@ -489,9 +535,9 @@ def test_read_write_dta13(self): formatted = original formatted["int64"] = formatted["int64"].astype(np.float64) - with tm.ensure_clean() as path: - original.to_stata(path) - written_and_read_again = self.read_dta(path) + path = temp_file + original.to_stata(path) + written_and_read_again = self.read_dta(path) expected = formatted expected.index = expected.index.astype(np.int32) @@ -501,19 +547,20 @@ def test_read_write_dta13(self): @pytest.mark.parametrize( "file", ["stata5_113", "stata5_114", "stata5_115", "stata5_117"] ) - def test_read_write_reread_dta14(self, file, parsed_114, version, datapath): + def test_read_write_reread_dta14( + self, file, parsed_114, version, datapath, temp_file + ): file = datapath("io", "data", "stata", f"{file}.dta") parsed = self.read_dta(file) parsed.index.name = "index" tm.assert_frame_equal(parsed_114, parsed) - with tm.ensure_clean() as path: - parsed_114.to_stata(path, convert_dates={"date_td": "td"}, version=version) - written_and_read_again = self.read_dta(path) + path = temp_file + parsed_114.to_stata(path, convert_dates={"date_td": "td"}, version=version) + written_and_read_again = self.read_dta(path) expected = parsed_114.copy() - expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) @pytest.mark.parametrize( @@ -537,38 +584,38 @@ def test_read_write_reread_dta15(self, file, datapath): tm.assert_frame_equal(expected, parsed) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_timestamp_and_label(self, version): + def test_timestamp_and_label(self, version, temp_file): original = DataFrame([(1,)], columns=["variable"]) time_stamp = datetime(2000, 2, 29, 14, 21) data_label = "This is a data file." 
- with tm.ensure_clean() as path: - original.to_stata( - path, time_stamp=time_stamp, data_label=data_label, version=version - ) + path = temp_file + original.to_stata( + path, time_stamp=time_stamp, data_label=data_label, version=version + ) - with StataReader(path) as reader: - assert reader.time_stamp == "29 Feb 2000 14:21" - assert reader.data_label == data_label + with StataReader(path) as reader: + assert reader.time_stamp == "29 Feb 2000 14:21" + assert reader.data_label == data_label @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_invalid_timestamp(self, version): + def test_invalid_timestamp(self, version, temp_file): original = DataFrame([(1,)], columns=["variable"]) time_stamp = "01 Jan 2000, 00:00:00" - with tm.ensure_clean() as path: - msg = "time_stamp should be datetime type" - with pytest.raises(ValueError, match=msg): - original.to_stata(path, time_stamp=time_stamp, version=version) - assert not os.path.isfile(path) + path = temp_file + msg = "time_stamp should be datetime type" + with pytest.raises(ValueError, match=msg): + original.to_stata(path, time_stamp=time_stamp, version=version) + assert not os.path.isfile(path) - def test_numeric_column_names(self): + def test_numeric_column_names(self, temp_file): original = DataFrame(np.reshape(np.arange(25.0), (5, 5))) original.index.name = "index" - with tm.ensure_clean() as path: - # should get a warning for that format. - with tm.assert_produces_warning(InvalidColumnName): - original.to_stata(path) + path = temp_file + # should get a warning for that format. + with tm.assert_produces_warning(InvalidColumnName): + original.to_stata(path) - written_and_read_again = self.read_dta(path) + written_and_read_again = self.read_dta(path) written_and_read_again = written_and_read_again.set_index("index") columns = list(written_and_read_again.columns) @@ -576,11 +623,10 @@ def test_numeric_column_names(self): written_and_read_again.columns = map(convert_col_name, columns) expected = original - expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(expected, written_and_read_again) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_nan_to_missing_value(self, version): + def test_nan_to_missing_value(self, version, temp_file): s1 = Series(np.arange(4.0), dtype=np.float32) s2 = Series(np.arange(4.0), dtype=np.float64) s1[::2] = np.nan @@ -588,66 +634,63 @@ def test_nan_to_missing_value(self, version): original = DataFrame({"s1": s1, "s2": s2}) original.index.name = "index" - with tm.ensure_clean() as path: - original.to_stata(path, version=version) - written_and_read_again = self.read_dta(path) + path = temp_file + original.to_stata(path, version=version) + written_and_read_again = self.read_dta(path) written_and_read_again = written_and_read_again.set_index("index") expected = original - expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(written_and_read_again, expected) - def test_no_index(self): + def test_no_index(self, temp_file): columns = ["x", "y"] original = DataFrame(np.reshape(np.arange(10.0), (5, 2)), columns=columns) original.index.name = "index_not_written" - with tm.ensure_clean() as path: - original.to_stata(path, write_index=False) - written_and_read_again = self.read_dta(path) - with pytest.raises(KeyError, match=original.index.name): - written_and_read_again["index_not_written"] + path = temp_file + original.to_stata(path, write_index=False) + written_and_read_again = self.read_dta(path) + with pytest.raises(KeyError, 
match=original.index.name): + written_and_read_again["index_not_written"] - def test_string_no_dates(self): + def test_string_no_dates(self, temp_file): s1 = Series(["a", "A longer string"]) s2 = Series([1.0, 2.0], dtype=np.float64) original = DataFrame({"s1": s1, "s2": s2}) original.index.name = "index" - with tm.ensure_clean() as path: - original.to_stata(path) - written_and_read_again = self.read_dta(path) + path = temp_file + original.to_stata(path) + written_and_read_again = self.read_dta(path) expected = original - expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) - def test_large_value_conversion(self): + def test_large_value_conversion(self, temp_file): s0 = Series([1, 99], dtype=np.int8) s1 = Series([1, 127], dtype=np.int8) s2 = Series([1, 2**15 - 1], dtype=np.int16) s3 = Series([1, 2**63 - 1], dtype=np.int64) original = DataFrame({"s0": s0, "s1": s1, "s2": s2, "s3": s3}) original.index.name = "index" - with tm.ensure_clean() as path: - with tm.assert_produces_warning(PossiblePrecisionLoss): - original.to_stata(path) + path = temp_file + with tm.assert_produces_warning(PossiblePrecisionLoss): + original.to_stata(path) - written_and_read_again = self.read_dta(path) + written_and_read_again = self.read_dta(path) modified = original modified["s1"] = Series(modified["s1"], dtype=np.int16) modified["s2"] = Series(modified["s2"], dtype=np.int32) modified["s3"] = Series(modified["s3"], dtype=np.float64) - modified.index = original.index.astype(np.int32) tm.assert_frame_equal(written_and_read_again.set_index("index"), modified) - def test_dates_invalid_column(self): + def test_dates_invalid_column(self, temp_file): original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)]) original.index.name = "index" - with tm.ensure_clean() as path: - with tm.assert_produces_warning(InvalidColumnName): - original.to_stata(path, convert_dates={0: "tc"}) + path = temp_file + with tm.assert_produces_warning(InvalidColumnName): + original.to_stata(path, convert_dates={0: "tc"}) - written_and_read_again = self.read_dta(path) + written_and_read_again = self.read_dta(path) expected = original.copy() expected.columns = ["_0"] @@ -678,7 +721,7 @@ def test_value_labels_old_format(self, datapath): with StataReader(dpath) as reader: assert reader.value_labels() == {} - def test_date_export_formats(self): + def test_date_export_formats(self, temp_file): columns = ["tc", "td", "tw", "tm", "tq", "th", "ty"] conversions = {c: c for c in columns} data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns) @@ -702,30 +745,30 @@ def test_date_export_formats(self): ) expected["tc"] = expected["tc"].astype("M8[ms]") - with tm.ensure_clean() as path: - original.to_stata(path, convert_dates=conversions) - written_and_read_again = self.read_dta(path) + path = temp_file + original.to_stata(path, convert_dates=conversions) + written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) - def test_write_missing_strings(self): + def test_write_missing_strings(self, temp_file): original = DataFrame([["1"], [None]], columns=["foo"]) expected = DataFrame( [["1"], [""]], - index=pd.Index([0, 1], dtype=np.int32, name="index"), + index=pd.RangeIndex(2, name="index"), columns=["foo"], ) - with tm.ensure_clean() as path: - original.to_stata(path) - written_and_read_again = self.read_dta(path) + path = temp_file + original.to_stata(path) + written_and_read_again = self.read_dta(path) 
tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @pytest.mark.parametrize("byteorder", [">", "<"]) - def test_bool_uint(self, byteorder, version): + def test_bool_uint(self, byteorder, version, temp_file): s0 = Series([0, 1, True], dtype=np.bool_) s1 = Series([0, 1, 100], dtype=np.uint8) s2 = Series([0, 1, 255], dtype=np.uint8) @@ -739,14 +782,13 @@ def test_bool_uint(self, byteorder, version): ) original.index.name = "index" - with tm.ensure_clean() as path: - original.to_stata(path, byteorder=byteorder, version=version) - written_and_read_again = self.read_dta(path) + path = temp_file + original.to_stata(path, byteorder=byteorder, version=version) + written_and_read_again = self.read_dta(path) written_and_read_again = written_and_read_again.set_index("index") expected = original - expected.index = expected.index.astype(np.int32) expected_types = ( np.int8, np.int8, @@ -774,7 +816,7 @@ def test_variable_labels(self, datapath): assert k in keys assert v in labels - def test_minimal_size_col(self): + def test_minimal_size_col(self, temp_file): str_lens = (1, 100, 244) s = {} for str_len in str_lens: @@ -782,16 +824,16 @@ def test_minimal_size_col(self): ["a" * str_len, "b" * str_len, "c" * str_len] ) original = DataFrame(s) - with tm.ensure_clean() as path: - original.to_stata(path, write_index=False) + path = temp_file + original.to_stata(path, write_index=False) - with StataReader(path) as sr: - sr._ensure_open() # The `_*list` variables are initialized here - for variable, fmt, typ in zip(sr._varlist, sr._fmtlist, sr._typlist): - assert int(variable[1:]) == int(fmt[1:-1]) - assert int(variable[1:]) == typ + with StataReader(path) as sr: + sr._ensure_open() # The `_*list` variables are initialized here + for variable, fmt, typ in zip(sr._varlist, sr._fmtlist, sr._typlist): + assert int(variable[1:]) == int(fmt[1:-1]) + assert int(variable[1:]) == typ - def test_excessively_long_string(self): + def test_excessively_long_string(self, temp_file): str_lens = (1, 244, 500) s = {} for str_len in str_lens: @@ -806,16 +848,16 @@ def test_excessively_long_string(self): r"the newer \(Stata 13 and later\) format\." ) with pytest.raises(ValueError, match=msg): - with tm.ensure_clean() as path: - original.to_stata(path) + path = temp_file + original.to_stata(path) - def test_missing_value_generator(self): + def test_missing_value_generator(self, temp_file): types = ("b", "h", "l") df = DataFrame([[0.0]], columns=["float_"]) - with tm.ensure_clean() as path: - df.to_stata(path) - with StataReader(path) as rdr: - valid_range = rdr.VALID_RANGE + path = temp_file + df.to_stata(path) + with StataReader(path) as rdr: + valid_range = rdr.VALID_RANGE expected_values = ["." 
+ chr(97 + i) for i in range(26)] expected_values.insert(0, ".") for t in types: @@ -856,7 +898,7 @@ def test_missing_value_conversion(self, file, datapath): ) tm.assert_frame_equal(parsed, expected) - def test_big_dates(self, datapath): + def test_big_dates(self, datapath, temp_file): yr = [1960, 2000, 9999, 100, 2262, 1677] mo = [1, 1, 12, 1, 4, 9] dd = [1, 1, 31, 1, 22, 23] @@ -912,10 +954,10 @@ def test_big_dates(self, datapath): date_conversion = {c: c[-2:] for c in columns} # {c : c[-2:] for c in columns} - with tm.ensure_clean() as path: - expected.index.name = "index" - expected.to_stata(path, convert_dates=date_conversion) - written_and_read_again = self.read_dta(path) + path = temp_file + expected.index.name = "index" + expected.to_stata(path, convert_dates=date_conversion) + written_and_read_again = self.read_dta(path) tm.assert_frame_equal( written_and_read_again.set_index("index"), @@ -1000,7 +1042,7 @@ def test_drop_column(self, datapath): @pytest.mark.filterwarnings( "ignore:\\nStata value:pandas.io.stata.ValueLabelTypeMismatch" ) - def test_categorical_writing(self, version): + def test_categorical_writing(self, version, temp_file): original = DataFrame.from_records( [ ["one", "ten", "one", "one", "one", 1], @@ -1023,14 +1065,14 @@ def test_categorical_writing(self, version): "unlabeled", ], ) - with tm.ensure_clean() as path: - original.astype("category").to_stata(path, version=version) - written_and_read_again = self.read_dta(path) + path = temp_file + original.astype("category").to_stata(path, version=version) + written_and_read_again = self.read_dta(path) res = written_and_read_again.set_index("index") expected = original - expected.index = expected.index.set_names("index").astype(np.int32) + expected.index = expected.index.set_names("index") expected["incompletely_labeled"] = expected["incompletely_labeled"].apply(str) expected["unlabeled"] = expected["unlabeled"].apply(str) @@ -1048,7 +1090,7 @@ def test_categorical_writing(self, version): tm.assert_frame_equal(res, expected) - def test_categorical_warnings_and_errors(self): + def test_categorical_warnings_and_errors(self, temp_file): # Warning for non-string labels # Error for labels too long original = DataFrame.from_records( @@ -1057,13 +1099,13 @@ def test_categorical_warnings_and_errors(self): ) original = original.astype("category") - with tm.ensure_clean() as path: - msg = ( - "Stata value labels for a single variable must have " - r"a combined length less than 32,000 characters\." - ) - with pytest.raises(ValueError, match=msg): - original.to_stata(path) + path = temp_file + msg = ( + "Stata value labels for a single variable must have " + r"a combined length less than 32,000 characters\." 
+ ) + with pytest.raises(ValueError, match=msg): + original.to_stata(path) original = DataFrame.from_records( [["a"], ["b"], ["c"], ["d"], [1]], columns=["Too_long"] @@ -1074,7 +1116,7 @@ def test_categorical_warnings_and_errors(self): # should get a warning for mixed content @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_categorical_with_stata_missing_values(self, version): + def test_categorical_with_stata_missing_values(self, version, temp_file): values = [["a" + str(i)] for i in range(120)] values.append([np.nan]) original = DataFrame.from_records(values, columns=["many_labels"]) @@ -1082,9 +1124,9 @@ def test_categorical_with_stata_missing_values(self, version): [original[col].astype("category") for col in original], axis=1 ) original.index.name = "index" - with tm.ensure_clean() as path: - original.to_stata(path, version=version) - written_and_read_again = self.read_dta(path) + path = temp_file + original.to_stata(path, version=version) + written_and_read_again = self.read_dta(path) res = written_and_read_again.set_index("index") @@ -1094,7 +1136,6 @@ def test_categorical_with_stata_missing_values(self, version): new_cats = cat.remove_unused_categories().categories cat = cat.set_categories(new_cats, ordered=True) expected[col] = cat - expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(res, expected) @pytest.mark.parametrize("file", ["stata10_115", "stata10_117"]) @@ -1320,54 +1361,50 @@ def test_read_chunks_columns(self, datapath): pos += chunksize @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_write_variable_labels(self, version, mixed_frame): + def test_write_variable_labels(self, version, mixed_frame, temp_file): # GH 13631, add support for writing variable labels mixed_frame.index.name = "index" variable_labels = {"a": "City Rank", "b": "City Exponent", "c": "City"} - with tm.ensure_clean() as path: - mixed_frame.to_stata(path, variable_labels=variable_labels, version=version) - with StataReader(path) as sr: - read_labels = sr.variable_labels() - expected_labels = { - "index": "", - "a": "City Rank", - "b": "City Exponent", - "c": "City", - } - assert read_labels == expected_labels + path = temp_file + mixed_frame.to_stata(path, variable_labels=variable_labels, version=version) + with StataReader(path) as sr: + read_labels = sr.variable_labels() + expected_labels = { + "index": "", + "a": "City Rank", + "b": "City Exponent", + "c": "City", + } + assert read_labels == expected_labels variable_labels["index"] = "The Index" - with tm.ensure_clean() as path: - mixed_frame.to_stata(path, variable_labels=variable_labels, version=version) - with StataReader(path) as sr: - read_labels = sr.variable_labels() - assert read_labels == variable_labels + path = temp_file + mixed_frame.to_stata(path, variable_labels=variable_labels, version=version) + with StataReader(path) as sr: + read_labels = sr.variable_labels() + assert read_labels == variable_labels @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_invalid_variable_labels(self, version, mixed_frame): + def test_invalid_variable_labels(self, version, mixed_frame, temp_file): mixed_frame.index.name = "index" variable_labels = {"a": "very long" * 10, "b": "City Exponent", "c": "City"} - with tm.ensure_clean() as path: - msg = "Variable labels must be 80 characters or fewer" - with pytest.raises(ValueError, match=msg): - mixed_frame.to_stata( - path, variable_labels=variable_labels, version=version - ) + path = temp_file + msg = "Variable 
labels must be 80 characters or fewer" + with pytest.raises(ValueError, match=msg): + mixed_frame.to_stata(path, variable_labels=variable_labels, version=version) @pytest.mark.parametrize("version", [114, 117]) - def test_invalid_variable_label_encoding(self, version, mixed_frame): + def test_invalid_variable_label_encoding(self, version, mixed_frame, temp_file): mixed_frame.index.name = "index" variable_labels = {"a": "very long" * 10, "b": "City Exponent", "c": "City"} variable_labels["a"] = "invalid character Œ" - with tm.ensure_clean() as path: - with pytest.raises( - ValueError, match="Variable labels must contain only characters" - ): - mixed_frame.to_stata( - path, variable_labels=variable_labels, version=version - ) + path = temp_file + with pytest.raises( + ValueError, match="Variable labels must contain only characters" + ): + mixed_frame.to_stata(path, variable_labels=variable_labels, version=version) - def test_write_variable_label_errors(self, mixed_frame): + def test_write_variable_label_errors(self, mixed_frame, temp_file): values = ["\u03a1", "\u0391", "\u039d", "\u0394", "\u0391", "\u03a3"] variable_labels_utf8 = { @@ -1381,8 +1418,8 @@ def test_write_variable_label_errors(self, mixed_frame): "encoded in Latin-1" ) with pytest.raises(ValueError, match=msg): - with tm.ensure_clean() as path: - mixed_frame.to_stata(path, variable_labels=variable_labels_utf8) + path = temp_file + mixed_frame.to_stata(path, variable_labels=variable_labels_utf8) variable_labels_long = { "a": "City Rank", @@ -1394,10 +1431,10 @@ def test_write_variable_label_errors(self, mixed_frame): msg = "Variable labels must be 80 characters or fewer" with pytest.raises(ValueError, match=msg): - with tm.ensure_clean() as path: - mixed_frame.to_stata(path, variable_labels=variable_labels_long) + path = temp_file + mixed_frame.to_stata(path, variable_labels=variable_labels_long) - def test_default_date_conversion(self): + def test_default_date_conversion(self, temp_file): # GH 12259 dates = [ dt.datetime(1999, 12, 31, 12, 12, 12, 12000), @@ -1416,29 +1453,29 @@ def test_default_date_conversion(self): # "tc" for convert_dates below stores with "ms" resolution expected["dates"] = expected["dates"].astype("M8[ms]") - with tm.ensure_clean() as path: - original.to_stata(path, write_index=False) - reread = read_stata(path, convert_dates=True) - tm.assert_frame_equal(expected, reread) + path = temp_file + original.to_stata(path, write_index=False) + reread = read_stata(path, convert_dates=True) + tm.assert_frame_equal(expected, reread) - original.to_stata(path, write_index=False, convert_dates={"dates": "tc"}) - direct = read_stata(path, convert_dates=True) - tm.assert_frame_equal(reread, direct) + original.to_stata(path, write_index=False, convert_dates={"dates": "tc"}) + direct = read_stata(path, convert_dates=True) + tm.assert_frame_equal(reread, direct) - dates_idx = original.columns.tolist().index("dates") - original.to_stata(path, write_index=False, convert_dates={dates_idx: "tc"}) - direct = read_stata(path, convert_dates=True) - tm.assert_frame_equal(reread, direct) + dates_idx = original.columns.tolist().index("dates") + original.to_stata(path, write_index=False, convert_dates={dates_idx: "tc"}) + direct = read_stata(path, convert_dates=True) + tm.assert_frame_equal(reread, direct) - def test_unsupported_type(self): + def test_unsupported_type(self, temp_file): original = DataFrame({"a": [1 + 2j, 2 + 4j]}) msg = "Data type complex128 not supported" with pytest.raises(NotImplementedError, match=msg): - with 
tm.ensure_clean() as path: - original.to_stata(path) + path = temp_file + original.to_stata(path) - def test_unsupported_datetype(self): + def test_unsupported_datetype(self, temp_file): dates = [ dt.datetime(1999, 12, 31, 12, 12, 12, 12000), dt.datetime(2012, 12, 21, 12, 21, 12, 21000), @@ -1454,8 +1491,8 @@ def test_unsupported_datetype(self): msg = "Format %tC not implemented" with pytest.raises(NotImplementedError, match=msg): - with tm.ensure_clean() as path: - original.to_stata(path, convert_dates={"dates": "tC"}) + path = temp_file + original.to_stata(path, convert_dates={"dates": "tC"}) dates = pd.date_range("1-1-1990", periods=3, tz="Asia/Hong_Kong") original = DataFrame( @@ -1466,8 +1503,8 @@ def test_unsupported_datetype(self): } ) with pytest.raises(NotImplementedError, match="Data type datetime64"): - with tm.ensure_clean() as path: - original.to_stata(path) + path = temp_file + original.to_stata(path) def test_repeated_column_labels(self, datapath): # GH 13923, 25772 @@ -1503,7 +1540,7 @@ def test_stata_111(self, datapath): original = original[["y", "x", "w", "z"]] tm.assert_frame_equal(original, df) - def test_out_of_range_double(self): + def test_out_of_range_double(self, temp_file): # GH 14618 df = DataFrame( { @@ -1516,10 +1553,10 @@ def test_out_of_range_double(self): r"supported by Stata \(.+\)" ) with pytest.raises(ValueError, match=msg): - with tm.ensure_clean() as path: - df.to_stata(path) + path = temp_file + df.to_stata(path) - def test_out_of_range_float(self): + def test_out_of_range_float(self, temp_file): original = DataFrame( { "ColumnOk": [ @@ -1538,17 +1575,16 @@ def test_out_of_range_float(self): for col in original: original[col] = original[col].astype(np.float32) - with tm.ensure_clean() as path: - original.to_stata(path) - reread = read_stata(path) + path = temp_file + original.to_stata(path) + reread = read_stata(path) original["ColumnTooBig"] = original["ColumnTooBig"].astype(np.float64) expected = original - expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(reread.set_index("index"), expected) @pytest.mark.parametrize("infval", [np.inf, -np.inf]) - def test_inf(self, infval): + def test_inf(self, infval, temp_file): # GH 45350 df = DataFrame({"WithoutInf": [0.0, 1.0], "WithInf": [2.0, infval]}) msg = ( @@ -1556,8 +1592,8 @@ def test_inf(self, infval): "which is outside the range supported by Stata." 
) with pytest.raises(ValueError, match=msg): - with tm.ensure_clean() as path: - df.to_stata(path) + path = temp_file + df.to_stata(path) def test_path_pathlib(self): df = DataFrame( @@ -1571,19 +1607,19 @@ def test_path_pathlib(self): tm.assert_frame_equal(df, result) @pytest.mark.parametrize("write_index", [True, False]) - def test_value_labels_iterator(self, write_index): + def test_value_labels_iterator(self, write_index, temp_file): # GH 16923 d = {"A": ["B", "E", "C", "A", "E"]} df = DataFrame(data=d) df["A"] = df["A"].astype("category") - with tm.ensure_clean() as path: - df.to_stata(path, write_index=write_index) + path = temp_file + df.to_stata(path, write_index=write_index) - with read_stata(path, iterator=True) as dta_iter: - value_labels = dta_iter.value_labels() + with read_stata(path, iterator=True) as dta_iter: + value_labels = dta_iter.value_labels() assert value_labels == {"A": {0: "A", 1: "B", 2: "C", 3: "E"}} - def test_set_index(self): + def test_set_index(self, temp_file): # GH 17328 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -1591,9 +1627,9 @@ def test_set_index(self): index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), ) df.index.name = "index" - with tm.ensure_clean() as path: - df.to_stata(path) - reread = read_stata(path, index_col="index") + path = temp_file + df.to_stata(path) + reread = read_stata(path, index_col="index") tm.assert_frame_equal(df, reread) @pytest.mark.parametrize( @@ -1616,7 +1652,7 @@ def test_date_parsing_ignores_format_details(self, column, datapath): formatted = df.loc[0, column + "_fmt"] assert unformatted == formatted - def test_writer_117(self): + def test_writer_117(self, temp_file): original = DataFrame( data=[ [ @@ -1669,16 +1705,15 @@ def test_writer_117(self): original["int32"] = original["int32"].astype(np.int32) original["float32"] = Series(original["float32"], dtype=np.float32) original.index.name = "index" - original.index = original.index.astype(np.int32) copy = original.copy() - with tm.ensure_clean() as path: - original.to_stata( - path, - convert_dates={"datetime": "tc"}, - convert_strl=["forced_strl"], - version=117, - ) - written_and_read_again = self.read_dta(path) + path = temp_file + original.to_stata( + path, + convert_dates={"datetime": "tc"}, + convert_strl=["forced_strl"], + version=117, + ) + written_and_read_again = self.read_dta(path) expected = original[:] # "tc" for convert_dates means we store with "ms" resolution @@ -1690,7 +1725,7 @@ def test_writer_117(self): ) tm.assert_frame_equal(original, copy) - def test_convert_strl_name_swap(self): + def test_convert_strl_name_swap(self, temp_file): original = DataFrame( [["a" * 3000, "A", "apple"], ["b" * 1000, "B", "banana"]], columns=["long1" * 10, "long", 1], @@ -1698,14 +1733,14 @@ def test_convert_strl_name_swap(self): original.index.name = "index" with tm.assert_produces_warning(InvalidColumnName): - with tm.ensure_clean() as path: - original.to_stata(path, convert_strl=["long", 1], version=117) - reread = self.read_dta(path) - reread = reread.set_index("index") - reread.columns = original.columns - tm.assert_frame_equal(reread, original, check_index_type=False) - - def test_invalid_date_conversion(self): + path = temp_file + original.to_stata(path, convert_strl=["long", 1], version=117) + reread = self.read_dta(path) + reread = reread.set_index("index") + reread.columns = original.columns + tm.assert_frame_equal(reread, original, check_index_type=False) + + def test_invalid_date_conversion(self, temp_file): # GH 12259 dates = [ 
dt.datetime(1999, 12, 31, 12, 12, 12, 12000), @@ -1720,13 +1755,13 @@ def test_invalid_date_conversion(self): } ) - with tm.ensure_clean() as path: - msg = "convert_dates key must be a column or an integer" - with pytest.raises(ValueError, match=msg): - original.to_stata(path, convert_dates={"wrong_name": "tc"}) + path = temp_file + msg = "convert_dates key must be a column or an integer" + with pytest.raises(ValueError, match=msg): + original.to_stata(path, convert_dates={"wrong_name": "tc"}) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_nonfile_writing(self, version): + def test_nonfile_writing(self, version, temp_file): # GH 21041 bio = io.BytesIO() df = DataFrame( @@ -1735,15 +1770,15 @@ def test_nonfile_writing(self, version): index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), ) df.index.name = "index" - with tm.ensure_clean() as path: - df.to_stata(bio, version=version) - bio.seek(0) - with open(path, "wb") as dta: - dta.write(bio.read()) - reread = read_stata(path, index_col="index") + path = temp_file + df.to_stata(bio, version=version) + bio.seek(0) + with open(path, "wb") as dta: + dta.write(bio.read()) + reread = read_stata(path, index_col="index") tm.assert_frame_equal(df, reread) - def test_gzip_writing(self): + def test_gzip_writing(self, temp_file): # writing version 117 requires seek and cannot be used with gzip df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -1751,11 +1786,11 @@ def test_gzip_writing(self): index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), ) df.index.name = "index" - with tm.ensure_clean() as path: - with gzip.GzipFile(path, "wb") as gz: - df.to_stata(gz, version=114) - with gzip.GzipFile(path, "rb") as gz: - reread = read_stata(gz, index_col="index") + path = temp_file + with gzip.GzipFile(path, "wb") as gz: + df.to_stata(gz, version=114) + with gzip.GzipFile(path, "rb") as gz: + reread = read_stata(gz, index_col="index") tm.assert_frame_equal(df, reread) def test_unicode_dta_118(self, datapath): @@ -1775,70 +1810,65 @@ def test_unicode_dta_118(self, datapath): tm.assert_frame_equal(unicode_df, expected) - def test_mixed_string_strl(self): + def test_mixed_string_strl(self, temp_file): # GH 23633 output = [{"mixed": "string" * 500, "number": 0}, {"mixed": None, "number": 1}] output = DataFrame(output) output.number = output.number.astype("int32") - with tm.ensure_clean() as path: - output.to_stata(path, write_index=False, version=117) - reread = read_stata(path) - expected = output.fillna("") - tm.assert_frame_equal(reread, expected) + path = temp_file + output.to_stata(path, write_index=False, version=117) + reread = read_stata(path) + expected = output.fillna("") + tm.assert_frame_equal(reread, expected) - # Check strl supports all None (null) - output["mixed"] = None - output.to_stata( - path, write_index=False, convert_strl=["mixed"], version=117 - ) - reread = read_stata(path) - expected = output.fillna("") - tm.assert_frame_equal(reread, expected) + # Check strl supports all None (null) + output["mixed"] = None + output.to_stata(path, write_index=False, convert_strl=["mixed"], version=117) + reread = read_stata(path) + expected = output.fillna("") + tm.assert_frame_equal(reread, expected) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_all_none_exception(self, version): + def test_all_none_exception(self, version, temp_file): output = [{"none": "none", "number": 0}, {"none": None, "number": 1}] output = DataFrame(output) output["none"] = None - with 
tm.ensure_clean() as path: - with pytest.raises(ValueError, match="Column `none` cannot be exported"): - output.to_stata(path, version=version) + with pytest.raises(ValueError, match="Column `none` cannot be exported"): + output.to_stata(temp_file, version=version) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_invalid_file_not_written(self, version): + def test_invalid_file_not_written(self, version, temp_file): content = "Here is one __�__ Another one __·__ Another one __½__" df = DataFrame([content], columns=["invalid"]) - with tm.ensure_clean() as path: - msg1 = ( - r"'latin-1' codec can't encode character '\\ufffd' " - r"in position 14: ordinal not in range\(256\)" - ) - msg2 = ( - "'ascii' codec can't decode byte 0xef in position 14: " - r"ordinal not in range\(128\)" - ) - with pytest.raises(UnicodeEncodeError, match=f"{msg1}|{msg2}"): - df.to_stata(path) + msg1 = ( + r"'latin-1' codec can't encode character '\\ufffd' " + r"in position 14: ordinal not in range\(256\)" + ) + msg2 = ( + "'ascii' codec can't decode byte 0xef in position 14: " + r"ordinal not in range\(128\)" + ) + with pytest.raises(UnicodeEncodeError, match=f"{msg1}|{msg2}"): + df.to_stata(temp_file) - def test_strl_latin1(self): + def test_strl_latin1(self, temp_file): # GH 23573, correct GSO data to reflect correct size output = DataFrame( [["pandas"] * 2, ["þâÑÐŧ"] * 2], columns=["var_str", "var_strl"] ) - with tm.ensure_clean() as path: - output.to_stata(path, version=117, convert_strl=["var_strl"]) - with open(path, "rb") as reread: - content = reread.read() - expected = "þâÑÐŧ" - assert expected.encode("latin-1") in content - assert expected.encode("utf-8") in content - gsos = content.split(b"strls")[1][1:-2] - for gso in gsos.split(b"GSO")[1:]: - val = gso.split(b"\x00")[-2] - size = gso[gso.find(b"\x82") + 1] - assert len(val) == size - 1 + output.to_stata(temp_file, version=117, convert_strl=["var_strl"]) + with open(temp_file, "rb") as reread: + content = reread.read() + expected = "þâÑÐŧ" + assert expected.encode("latin-1") in content + assert expected.encode("utf-8") in content + gsos = content.split(b"strls")[1][1:-2] + for gso in gsos.split(b"GSO")[1:]: + val = gso.split(b"\x00")[-2] + size = gso[gso.find(b"\x82") + 1] + assert len(val) == size - 1 def test_encoding_latin1_118(self, datapath): # GH 25960 @@ -1873,7 +1903,7 @@ def test_stata_119(self, datapath): assert reader._nvar == 32999 @pytest.mark.parametrize("version", [118, 119, None]) - def test_utf8_writer(self, version): + def test_utf8_writer(self, version, temp_file): cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) data = DataFrame( [ @@ -1894,48 +1924,45 @@ def test_utf8_writer(self, version): data_label = "ᴅaᵀa-label" value_labels = {"β": {1: "label", 2: "æøå", 3: "ŋot valid latin-1"}} data["β"] = data["β"].astype(np.int32) - with tm.ensure_clean() as path: - writer = StataWriterUTF8( - path, - data, - data_label=data_label, - convert_strl=["strls"], - variable_labels=variable_labels, - write_index=False, - version=version, - value_labels=value_labels, - ) - writer.write_file() - reread_encoded = read_stata(path) - # Missing is intentionally converted to empty strl - data["strls"] = data["strls"].fillna("") - # Variable with value labels is reread as categorical - data["β"] = ( - data["β"].replace(value_labels["β"]).astype("category").cat.as_ordered() - ) - tm.assert_frame_equal(data, reread_encoded) - with StataReader(path) as reader: - assert reader.data_label == data_label - assert reader.variable_labels() 
== variable_labels + writer = StataWriterUTF8( + temp_file, + data, + data_label=data_label, + convert_strl=["strls"], + variable_labels=variable_labels, + write_index=False, + version=version, + value_labels=value_labels, + ) + writer.write_file() + reread_encoded = read_stata(temp_file) + # Missing is intentionally converted to empty strl + data["strls"] = data["strls"].fillna("") + # Variable with value labels is reread as categorical + data["β"] = ( + data["β"].replace(value_labels["β"]).astype("category").cat.as_ordered() + ) + tm.assert_frame_equal(data, reread_encoded) + with StataReader(temp_file) as reader: + assert reader.data_label == data_label + assert reader.variable_labels() == variable_labels - data.to_stata(path, version=version, write_index=False) - reread_to_stata = read_stata(path) - tm.assert_frame_equal(data, reread_to_stata) + data.to_stata(temp_file, version=version, write_index=False) + reread_to_stata = read_stata(temp_file) + tm.assert_frame_equal(data, reread_to_stata) - def test_writer_118_exceptions(self): + def test_writer_118_exceptions(self, temp_file): df = DataFrame(np.zeros((1, 33000), dtype=np.int8)) - with tm.ensure_clean() as path: - with pytest.raises(ValueError, match="version must be either 118 or 119."): - StataWriterUTF8(path, df, version=117) - with tm.ensure_clean() as path: - with pytest.raises(ValueError, match="You must use version 119"): - StataWriterUTF8(path, df, version=118) + with pytest.raises(ValueError, match="version must be either 118 or 119."): + StataWriterUTF8(temp_file, df, version=117) + with pytest.raises(ValueError, match="You must use version 119"): + StataWriterUTF8(temp_file, df, version=118) @pytest.mark.parametrize( "dtype_backend", ["numpy_nullable", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))], ) - def test_read_write_ea_dtypes(self, dtype_backend): + def test_read_write_ea_dtypes(self, dtype_backend, temp_file): df = DataFrame( { "a": [1, 2, None], @@ -1949,9 +1976,8 @@ def test_read_write_ea_dtypes(self, dtype_backend): df = df.convert_dtypes(dtype_backend=dtype_backend) df.to_stata("test_stata.dta", version=118) - with tm.ensure_clean() as path: - df.to_stata(path) - written_and_read_again = self.read_dta(path) + df.to_stata(temp_file) + written_and_read_again = self.read_dta(temp_file) expected = DataFrame( { @@ -1962,7 +1988,7 @@ def test_read_write_ea_dtypes(self, dtype_backend): # stata stores with ms unit, so unit does not round-trip exactly "e": pd.date_range("2020-12-31", periods=3, freq="D", unit="ms"), }, - index=pd.Index([0, 1, 2], name="index", dtype=np.int32), + index=pd.RangeIndex(range(3), name="index"), ) tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) @@ -1978,6 +2004,15 @@ def test_backward_compat(version, datapath): tm.assert_frame_equal(old_dta, expected, check_dtype=False) +@pytest.mark.parametrize("version", [105, 108, 111, 113, 114, 118]) +def test_bigendian(version, datapath): + ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta") + big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta") + expected = read_stata(ref) + big_dta = read_stata(big) + tm.assert_frame_equal(big_dta, expected) + + def test_direct_read(datapath, monkeypatch): file_path = datapath("io", "data", "stata", "stata-compat-118.dta") @@ -2004,7 +2039,9 @@ def test_direct_read(datapath, monkeypatch): @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @pytest.mark.parametrize("use_dict", [True, False]) @pytest.mark.parametrize("infer", [True, 
False]) -def test_compression(compression, version, use_dict, infer, compression_to_extension): +def test_compression( + compression, version, use_dict, infer, compression_to_extension, tmp_path +): file_name = "dta_inferred_compression.dta" if compression: if use_dict: @@ -2022,80 +2059,80 @@ def test_compression(compression, version, use_dict, infer, compression_to_exten np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB") ) df.index.name = "index" - with tm.ensure_clean(file_name) as path: - df.to_stata(path, version=version, compression=compression_arg) - if compression == "gzip": - with gzip.open(path, "rb") as comp: - fp = io.BytesIO(comp.read()) - elif compression == "zip": - with zipfile.ZipFile(path, "r") as comp: - fp = io.BytesIO(comp.read(comp.filelist[0])) - elif compression == "tar": - with tarfile.open(path) as tar: - fp = io.BytesIO(tar.extractfile(tar.getnames()[0]).read()) - elif compression == "bz2": - with bz2.open(path, "rb") as comp: - fp = io.BytesIO(comp.read()) - elif compression == "zstd": - zstd = pytest.importorskip("zstandard") - with zstd.open(path, "rb") as comp: - fp = io.BytesIO(comp.read()) - elif compression == "xz": - lzma = pytest.importorskip("lzma") - with lzma.open(path, "rb") as comp: - fp = io.BytesIO(comp.read()) - elif compression is None: - fp = path - reread = read_stata(fp, index_col="index") + path = tmp_path / file_name + path.touch() + df.to_stata(path, version=version, compression=compression_arg) + if compression == "gzip": + with gzip.open(path, "rb") as comp: + fp = io.BytesIO(comp.read()) + elif compression == "zip": + with zipfile.ZipFile(path, "r") as comp: + fp = io.BytesIO(comp.read(comp.filelist[0])) + elif compression == "tar": + with tarfile.open(path) as tar: + fp = io.BytesIO(tar.extractfile(tar.getnames()[0]).read()) + elif compression == "bz2": + with bz2.open(path, "rb") as comp: + fp = io.BytesIO(comp.read()) + elif compression == "zstd": + zstd = pytest.importorskip("zstandard") + with zstd.open(path, "rb") as comp: + fp = io.BytesIO(comp.read()) + elif compression == "xz": + lzma = pytest.importorskip("lzma") + with lzma.open(path, "rb") as comp: + fp = io.BytesIO(comp.read()) + elif compression is None: + fp = path + reread = read_stata(fp, index_col="index") expected = df - expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(reread, expected) @pytest.mark.parametrize("method", ["zip", "infer"]) @pytest.mark.parametrize("file_ext", [None, "dta", "zip"]) -def test_compression_dict(method, file_ext): +def test_compression_dict(method, file_ext, tmp_path): file_name = f"test.{file_ext}" archive_name = "test.dta" df = DataFrame( np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB") ) df.index.name = "index" - with tm.ensure_clean(file_name) as path: - compression = {"method": method, "archive_name": archive_name} - df.to_stata(path, compression=compression) - if method == "zip" or file_ext == "zip": - with zipfile.ZipFile(path, "r") as zp: - assert len(zp.filelist) == 1 - assert zp.filelist[0].filename == archive_name - fp = io.BytesIO(zp.read(zp.filelist[0])) - else: - fp = path - reread = read_stata(fp, index_col="index") + compression = {"method": method, "archive_name": archive_name} + path = tmp_path / file_name + path.touch() + df.to_stata(path, compression=compression) + if method == "zip" or file_ext == "zip": + with zipfile.ZipFile(path, "r") as zp: + assert len(zp.filelist) == 1 + assert zp.filelist[0].filename == archive_name + fp = 
io.BytesIO(zp.read(zp.filelist[0])) + else: + fp = path + reread = read_stata(fp, index_col="index") expected = df - expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(reread, expected) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) -def test_chunked_categorical(version): +def test_chunked_categorical(version, temp_file): df = DataFrame({"cats": Series(["a", "b", "a", "b", "c"], dtype="category")}) df.index.name = "index" expected = df.copy() - expected.index = expected.index.astype(np.int32) - with tm.ensure_clean() as path: - df.to_stata(path, version=version) - with StataReader(path, chunksize=2, order_categoricals=False) as reader: - for i, block in enumerate(reader): - block = block.set_index("index") - assert "cats" in block - tm.assert_series_equal( - block.cats, expected.cats.iloc[2 * i : 2 * (i + 1)] - ) + df.to_stata(temp_file, version=version) + with StataReader(temp_file, chunksize=2, order_categoricals=False) as reader: + for i, block in enumerate(reader): + block = block.set_index("index") + assert "cats" in block + tm.assert_series_equal( + block.cats, + expected.cats.iloc[2 * i : 2 * (i + 1)], + check_index_type=len(block) > 1, + ) def test_chunked_categorical_partial(datapath): @@ -2125,38 +2162,36 @@ def test_iterator_errors(datapath, chunksize): pass -def test_iterator_value_labels(): +def test_iterator_value_labels(temp_file): # GH 31544 values = ["c_label", "b_label"] + ["a_label"] * 500 df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)}) - with tm.ensure_clean() as path: - df.to_stata(path, write_index=False) - expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object") - with read_stata(path, chunksize=100) as reader: - for j, chunk in enumerate(reader): - for i in range(2): - tm.assert_index_equal(chunk.dtypes.iloc[i].categories, expected) - tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100]) + df.to_stata(temp_file, write_index=False) + expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object") + with read_stata(temp_file, chunksize=100) as reader: + for j, chunk in enumerate(reader): + for i in range(2): + tm.assert_index_equal(chunk.dtypes.iloc[i].categories, expected) + tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100]) -def test_precision_loss(): +def test_precision_loss(temp_file): df = DataFrame( [[sum(2**i for i in range(60)), sum(2**i for i in range(52))]], columns=["big", "little"], ) - with tm.ensure_clean() as path: - with tm.assert_produces_warning( - PossiblePrecisionLoss, match="Column converted from int64 to float64" - ): - df.to_stata(path, write_index=False) - reread = read_stata(path) - expected_dt = Series([np.float64, np.float64], index=["big", "little"]) - tm.assert_series_equal(reread.dtypes, expected_dt) - assert reread.loc[0, "little"] == df.loc[0, "little"] - assert reread.loc[0, "big"] == float(df.loc[0, "big"]) + with tm.assert_produces_warning( + PossiblePrecisionLoss, match="Column converted from int64 to float64" + ): + df.to_stata(temp_file, write_index=False) + reread = read_stata(temp_file) + expected_dt = Series([np.float64, np.float64], index=["big", "little"]) + tm.assert_series_equal(reread.dtypes, expected_dt) + assert reread.loc[0, "little"] == df.loc[0, "little"] + assert reread.loc[0, "big"] == float(df.loc[0, "big"]) -def test_compression_roundtrip(compression): +def test_compression_roundtrip(compression, temp_file): df = DataFrame( [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], index=["A", 
"B"], @@ -2164,22 +2199,21 @@ def test_compression_roundtrip(compression): ) df.index.name = "index" - with tm.ensure_clean() as path: - df.to_stata(path, compression=compression) - reread = read_stata(path, compression=compression, index_col="index") - tm.assert_frame_equal(df, reread) + df.to_stata(temp_file, compression=compression) + reread = read_stata(temp_file, compression=compression, index_col="index") + tm.assert_frame_equal(df, reread) - # explicitly ensure file was compressed. - with tm.decompress_file(path, compression) as fh: - contents = io.BytesIO(fh.read()) - reread = read_stata(contents, index_col="index") - tm.assert_frame_equal(df, reread) + # explicitly ensure file was compressed. + with tm.decompress_file(temp_file, compression) as fh: + contents = io.BytesIO(fh.read()) + reread = read_stata(contents, index_col="index") + tm.assert_frame_equal(df, reread) @pytest.mark.parametrize("to_infer", [True, False]) @pytest.mark.parametrize("read_infer", [True, False]) def test_stata_compression( - compression_only, read_infer, to_infer, compression_to_extension + compression_only, read_infer, to_infer, compression_to_extension, tmp_path ): compression = compression_only @@ -2196,13 +2230,14 @@ def test_stata_compression( to_compression = "infer" if to_infer else compression read_compression = "infer" if read_infer else compression - with tm.ensure_clean(filename) as path: - df.to_stata(path, compression=to_compression) - result = read_stata(path, compression=read_compression, index_col="index") - tm.assert_frame_equal(result, df) + path = tmp_path / filename + path.touch() + df.to_stata(path, compression=to_compression) + result = read_stata(path, compression=read_compression, index_col="index") + tm.assert_frame_equal(result, df) -def test_non_categorical_value_labels(): +def test_non_categorical_value_labels(temp_file): data = DataFrame( { "fully_labelled": [1, 2, 3, 3, 1], @@ -2212,35 +2247,35 @@ def test_non_categorical_value_labels(): } ) - with tm.ensure_clean() as path: - value_labels = { - "fully_labelled": {1: "one", 2: "two", 3: "three"}, - "partially_labelled": {1.0: "one", 2.0: "two"}, - } - expected = {**value_labels, "Z": {0: "j", 1: "k", 2: "l"}} + path = temp_file + value_labels = { + "fully_labelled": {1: "one", 2: "two", 3: "three"}, + "partially_labelled": {1.0: "one", 2.0: "two"}, + } + expected = {**value_labels, "Z": {0: "j", 1: "k", 2: "l"}} - writer = StataWriter(path, data, value_labels=value_labels) - writer.write_file() + writer = StataWriter(path, data, value_labels=value_labels) + writer.write_file() - with StataReader(path) as reader: - reader_value_labels = reader.value_labels() - assert reader_value_labels == expected + with StataReader(path) as reader: + reader_value_labels = reader.value_labels() + assert reader_value_labels == expected - msg = "Can't create value labels for notY, it wasn't found in the dataset." - value_labels = {"notY": {7: "label1", 8: "label2"}} - with pytest.raises(KeyError, match=msg): - StataWriter(path, data, value_labels=value_labels) + msg = "Can't create value labels for notY, it wasn't found in the dataset." + value_labels = {"notY": {7: "label1", 8: "label2"}} + with pytest.raises(KeyError, match=msg): + StataWriter(path, data, value_labels=value_labels) - msg = ( - "Can't create value labels for Z, value labels " - "can only be applied to numeric columns." 
- ) - value_labels = {"Z": {1: "a", 2: "k", 3: "j", 4: "i"}} - with pytest.raises(ValueError, match=msg): - StataWriter(path, data, value_labels=value_labels) + msg = ( + "Can't create value labels for Z, value labels " + "can only be applied to numeric columns." + ) + value_labels = {"Z": {1: "a", 2: "k", 3: "j", 4: "i"}} + with pytest.raises(ValueError, match=msg): + StataWriter(path, data, value_labels=value_labels) -def test_non_categorical_value_label_name_conversion(): +def test_non_categorical_value_label_name_conversion(temp_file): # Check conversion of invalid variable names data = DataFrame( { @@ -2268,16 +2303,15 @@ def test_non_categorical_value_label_name_conversion(): "_1__2_": {3: "three"}, } - with tm.ensure_clean() as path: - with tm.assert_produces_warning(InvalidColumnName): - data.to_stata(path, value_labels=value_labels) + with tm.assert_produces_warning(InvalidColumnName): + data.to_stata(temp_file, value_labels=value_labels) - with StataReader(path) as reader: - reader_value_labels = reader.value_labels() - assert reader_value_labels == expected + with StataReader(temp_file) as reader: + reader_value_labels = reader.value_labels() + assert reader_value_labels == expected -def test_non_categorical_value_label_convert_categoricals_error(): +def test_non_categorical_value_label_convert_categoricals_error(temp_file): # Mapping more than one value to the same label is valid for Stata # labels, but can't be read with convert_categoricals=True value_labels = { @@ -2290,17 +2324,16 @@ def test_non_categorical_value_label_convert_categoricals_error(): } ) - with tm.ensure_clean() as path: - data.to_stata(path, value_labels=value_labels) + data.to_stata(temp_file, value_labels=value_labels) - with StataReader(path, convert_categoricals=False) as reader: - reader_value_labels = reader.value_labels() - assert reader_value_labels == value_labels + with StataReader(temp_file, convert_categoricals=False) as reader: + reader_value_labels = reader.value_labels() + assert reader_value_labels == value_labels - col = "repeated_labels" - repeats = "-" * 80 + "\n" + "\n".join(["More than ten"]) + col = "repeated_labels" + repeats = "-" * 80 + "\n" + "\n".join(["More than ten"]) - msg = f""" + msg = f""" Value labels for column {col} are not unique. These cannot be converted to pandas categoricals. 
@@ -2311,8 +2344,8 @@ def test_non_categorical_value_label_convert_categoricals_error(): The repeated labels are: {repeats} """ - with pytest.raises(ValueError, match=msg): - read_stata(path, convert_categoricals=True) + with pytest.raises(ValueError, match=msg): + read_stata(temp_file, convert_categoricals=True) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @@ -2330,7 +2363,7 @@ def test_non_categorical_value_label_convert_categoricals_error(): pd.UInt64Dtype, ], ) -def test_nullable_support(dtype, version): +def test_nullable_support(dtype, version, temp_file): df = DataFrame( { "a": Series([1.0, 2.0, 3.0]), @@ -2349,27 +2382,26 @@ def test_nullable_support(dtype, version): smv = StataMissingValue(value) expected_b = Series([1, smv, smv], dtype=object, name="b") expected_c = Series(["a", "b", ""], name="c") - with tm.ensure_clean() as path: - df.to_stata(path, write_index=False, version=version) - reread = read_stata(path, convert_missing=True) - tm.assert_series_equal(df.a, reread.a) - tm.assert_series_equal(reread.b, expected_b) - tm.assert_series_equal(reread.c, expected_c) + df.to_stata(temp_file, write_index=False, version=version) + reread = read_stata(temp_file, convert_missing=True) + tm.assert_series_equal(df.a, reread.a) + tm.assert_series_equal(reread.b, expected_b) + tm.assert_series_equal(reread.c, expected_c) -def test_empty_frame(): +def test_empty_frame(temp_file): # GH 46240 # create an empty DataFrame with int64 and float64 dtypes df = DataFrame(data={"a": range(3), "b": [1.0, 2.0, 3.0]}).head(0) - with tm.ensure_clean() as path: - df.to_stata(path, write_index=False, version=117) - # Read entire dataframe - df2 = read_stata(path) - assert "b" in df2 - # Dtypes don't match since no support for int32 - dtypes = Series({"a": np.dtype("int32"), "b": np.dtype("float64")}) - tm.assert_series_equal(df2.dtypes, dtypes) - # read one column of empty .dta file - df3 = read_stata(path, columns=["a"]) - assert "b" not in df3 - tm.assert_series_equal(df3.dtypes, dtypes.loc[["a"]]) + path = temp_file + df.to_stata(path, write_index=False, version=117) + # Read entire dataframe + df2 = read_stata(path) + assert "b" in df2 + # Dtypes don't match since no support for int32 + dtypes = Series({"a": np.dtype("int32"), "b": np.dtype("float64")}) + tm.assert_series_equal(df2.dtypes, dtypes) + # read one column of empty .dta file + df3 = read_stata(path, columns=["a"]) + assert "b" not in df3 + tm.assert_series_equal(df3.dtypes, dtypes.loc[["a"]]) diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 2dd45a9abc7a5..f8029a1c1ee40 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -740,3 +740,17 @@ def test_boxplot_multiindex_column(self): expected_xticklabel = ["(bar, one)", "(bar, two)"] result_xticklabel = [x.get_text() for x in axes.get_xticklabels()] assert expected_xticklabel == result_xticklabel + + @pytest.mark.parametrize("group", ["X", ["X", "Y"]]) + def test_boxplot_multi_groupby_groups(self, group): + # GH 14701 + rows = 20 + df = DataFrame( + np.random.default_rng(12).normal(size=(rows, 2)), columns=["Col1", "Col2"] + ) + df["X"] = Series(np.repeat(["A", "B"], int(rows / 2))) + df["Y"] = Series(np.tile(["C", "D"], int(rows / 2))) + grouped = df.groupby(group) + _check_plot_works(df.boxplot, by=group, default_axes=True) + _check_plot_works(df.plot.box, by=group, default_axes=True) + _check_plot_works(grouped.boxplot, default_axes=True) diff --git 
a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 048553330c1ce..46753b668a8b0 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -171,9 +171,9 @@ def test_argminmax(self): obj.argmin() with pytest.raises(ValueError, match="Encountered all NA values"): obj.argmax() - with pytest.raises(ValueError, match="Encountered all NA values"): + with pytest.raises(ValueError, match="Encountered an NA value"): obj.argmin(skipna=False) - with pytest.raises(ValueError, match="Encountered all NA values"): + with pytest.raises(ValueError, match="Encountered an NA value"): obj.argmax(skipna=False) obj = Index([NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), NaT]) @@ -189,9 +189,9 @@ def test_argminmax(self): obj.argmin() with pytest.raises(ValueError, match="Encountered all NA values"): obj.argmax() - with pytest.raises(ValueError, match="Encountered all NA values"): + with pytest.raises(ValueError, match="Encountered an NA value"): obj.argmin(skipna=False) - with pytest.raises(ValueError, match="Encountered all NA values"): + with pytest.raises(ValueError, match="Encountered an NA value"): obj.argmax(skipna=False) @pytest.mark.parametrize("op, expected_col", [["max", "a"], ["min", "b"]]) @@ -378,7 +378,7 @@ def test_invalid_td64_reductions(self, opname): [ f"reduction operation '{opname}' not allowed for this dtype", rf"cannot perform {opname} with type timedelta64\[ns\]", - f"does not support reduction '{opname}'", + f"does not support operation '{opname}'", ] ) @@ -665,7 +665,7 @@ def test_empty(self, method, unit, use_bottleneck, dtype): # GH#844 (changed in GH#9422) df = DataFrame(np.empty((10, 0)), dtype=dtype) - assert (getattr(df, method)(1) == unit).all() + assert (getattr(df, method)(axis=1) == unit).all() s = Series([1], dtype=dtype) result = getattr(s, method)(min_count=2) @@ -714,7 +714,7 @@ def test_ops_consistency_on_empty(self, method): [ "operation 'var' not allowed", r"cannot perform var with type timedelta64\[ns\]", - "does not support reduction 'var'", + "does not support operation 'var'", ] ) with pytest.raises(TypeError, match=msg): @@ -856,7 +856,8 @@ def test_idxmin(self): # all NaNs allna = string_series * np.nan - with pytest.raises(ValueError, match="Encountered all NA values"): + msg = "Encountered all NA values" + with pytest.raises(ValueError, match=msg): allna.idxmin() # datetime64[ns] @@ -888,7 +889,8 @@ def test_idxmax(self): # all NaNs allna = string_series * np.nan - with pytest.raises(ValueError, match="Encountered all NA values"): + msg = "Encountered all NA values" + with pytest.raises(ValueError, match=msg): allna.idxmax() s = Series(date_range("20130102", periods=6)) @@ -1010,7 +1012,7 @@ def test_any_all_datetimelike(self): df = DataFrame(ser) # GH#34479 - msg = "datetime64 type does not support operation: '(any|all)'" + msg = "datetime64 type does not support operation '(any|all)'" with pytest.raises(TypeError, match=msg): dta.all() with pytest.raises(TypeError, match=msg): @@ -1155,12 +1157,12 @@ def test_idxminmax_object_dtype(self, using_infer_string): msg = "'>' not supported between instances of 'float' and 'str'" with pytest.raises(TypeError, match=msg): ser3.idxmax() - with pytest.raises(ValueError, match="Encountered an NA value"): + with pytest.raises(TypeError, match=msg): ser3.idxmax(skipna=False) msg = "'<' not supported between instances of 'float' and 'str'" with pytest.raises(TypeError, match=msg): ser3.idxmin() - with pytest.raises(ValueError, 
match="Encountered an NA value"): + with pytest.raises(TypeError, match=msg): ser3.idxmin(skipna=False) def test_idxminmax_object_frame(self): diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 60fcf8cbc142c..4af1ca1d4800a 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -99,7 +99,7 @@ def _check_stat_op( # mean, idxmax, idxmin, min, and max are valid for dates if name not in ["max", "min", "mean", "median", "std"]: ds = Series(date_range("1/1/2001", periods=10)) - msg = f"does not support reduction '{name}'" + msg = f"does not support operation '{name}'" with pytest.raises(TypeError, match=msg): f(ds) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 9b442fa7dbd07..a77097fd5ce61 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -709,7 +709,7 @@ def test_selection_api_validation(): exp.index.name = "d" with pytest.raises( - TypeError, match="datetime64 type does not support operation: 'sum'" + TypeError, match="datetime64 type does not support operation 'sum'" ): df.resample("2D", level="d").sum() result = df.resample("2D", level="d").sum(numeric_only=True) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index b986aa8182219..2f9fd1eb421d4 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -125,7 +125,7 @@ def test_concat_keys_specific_levels(self): tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key")) tm.assert_index_equal(result.columns.levels[1], Index([0, 1, 2, 3])) - assert result.columns.names == ("group_key", None) + assert result.columns.names == ["group_key", None] @pytest.mark.parametrize("mapping", ["mapping", "dict"]) def test_concat_mapping(self, mapping, non_dict_mapping_subclass): @@ -912,3 +912,11 @@ def test_concat_none_with_timezone_timestamp(): result = concat([df1, df2], ignore_index=True) expected = DataFrame({"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]}) tm.assert_frame_equal(result, expected) + + +def test_concat_with_series_and_frame_returns_rangeindex_columns(): + ser = Series([0]) + df = DataFrame([1, 2]) + result = concat([ser, df]) + expected = DataFrame([0, 1, 2], index=[0, 0, 1]) + tm.assert_frame_equal(result, expected, check_column_type=True) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 1cd52ab1ae8b4..1a764cb505ead 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2192,23 +2192,28 @@ def test_merge_on_indexes(self, how, sort, expected): @pytest.mark.parametrize( "index", - [Index([1, 2], dtype=dtyp, name="index_col") for dtyp in tm.ALL_REAL_NUMPY_DTYPES] + [ + Index([1, 2, 4], dtype=dtyp, name="index_col") + for dtyp in tm.ALL_REAL_NUMPY_DTYPES + ] + [ - CategoricalIndex(["A", "B"], categories=["A", "B"], name="index_col"), - RangeIndex(start=0, stop=2, name="index_col"), - DatetimeIndex(["2018-01-01", "2018-01-02"], name="index_col"), + CategoricalIndex(["A", "B", "C"], categories=["A", "B", "C"], name="index_col"), + RangeIndex(start=0, stop=3, name="index_col"), + DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"], name="index_col"), ], ids=lambda x: f"{type(x).__name__}[{x.dtype}]", ) def test_merge_index_types(index): # gh-20777 # assert key access 
is consistent across index types - left = DataFrame({"left_data": [1, 2]}, index=index) - right = DataFrame({"right_data": [1.0, 2.0]}, index=index) + left = DataFrame({"left_data": [1, 2, 3]}, index=index) + right = DataFrame({"right_data": [1.0, 2.0, 3.0]}, index=index) result = left.merge(right, on=["index_col"]) - expected = DataFrame({"left_data": [1, 2], "right_data": [1.0, 2.0]}, index=index) + expected = DataFrame( + {"left_data": [1, 2, 3], "right_data": [1.0, 2.0, 3.0]}, index=index + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 33d9a721df6b7..7ae2fffa04205 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -814,10 +814,12 @@ def test_join_multi_levels2(self): class TestJoinMultiMulti: def test_join_multi_multi(self, left_multi, right_multi, join_type, on_cols_multi): + left_names = left_multi.index.names + right_names = right_multi.index.names if join_type == "right": - level_order = ["Origin", "Destination", "Period", "LinkType", "TripPurp"] + level_order = right_names + left_names.difference(right_names) else: - level_order = ["Origin", "Destination", "Period", "TripPurp", "LinkType"] + level_order = left_names + right_names.difference(left_names) # Multi-index join tests expected = ( merge( @@ -839,10 +841,12 @@ def test_join_multi_empty_frames( left_multi = left_multi.drop(columns=left_multi.columns) right_multi = right_multi.drop(columns=right_multi.columns) + left_names = left_multi.index.names + right_names = right_multi.index.names if join_type == "right": - level_order = ["Origin", "Destination", "Period", "LinkType", "TripPurp"] + level_order = right_names + left_names.difference(right_names) else: - level_order = ["Origin", "Destination", "Period", "TripPurp", "LinkType"] + level_order = left_names + right_names.difference(left_names) expected = ( merge( diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index c4af63fe5cc81..070c756e8c928 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -135,7 +135,7 @@ def test_crosstab_margins(self): result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True) assert result.index.names == ("a",) - assert result.columns.names == ("b", "c") + assert result.columns.names == ["b", "c"] all_cols = result["All", ""] exp_cols = df.groupby(["a"]).size().astype("i8") @@ -173,7 +173,7 @@ def test_crosstab_margins_set_margin_name(self): ) assert result.index.names == ("a",) - assert result.columns.names == ("b", "c") + assert result.columns.names == ["b", "c"] all_cols = result["TOTAL", ""] exp_cols = df.groupby(["a"]).size().astype("i8") diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index f750d5e7fa919..97f06b0e379f4 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -819,7 +819,7 @@ def test_pivot_columns_none_raise_error(self): df = DataFrame({"col1": ["a", "b", "c"], "col2": [1, 2, 3], "col3": [1, 2, 3]}) msg = r"pivot\(\) missing 1 required keyword-only argument: 'columns'" with pytest.raises(TypeError, match=msg): - df.pivot(index="col1", values="col3") # pylint: disable=missing-kwoa + df.pivot(index="col1", values="col3") @pytest.mark.xfail( reason="MultiIndexed unstack with tuple names fails with KeyError GH#19966" @@ -2600,7 +2600,7 @@ def test_pivot_columns_not_given(self): # GH#48293 df = 
DataFrame({"a": [1], "b": 1}) with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): - df.pivot() # pylint: disable=missing-kwoa + df.pivot() @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") def test_pivot_columns_is_none(self): @@ -2703,3 +2703,16 @@ def test_pivot_table_with_margins_and_numeric_column_names(self): index=Index(["a", "b", "All"], name=0), ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("m", [1, 10]) + def test_unstack_shares_memory(self, m): + # GH#56633 + levels = np.arange(m) + index = MultiIndex.from_product([levels] * 2) + values = np.arange(m * m * 100).reshape(m * m, 100) + df = DataFrame(values, index, np.arange(100)) + df_orig = df.copy() + result = df.unstack(sort=False) + assert np.shares_memory(df._values, result._values) is (m == 1) + result.iloc[0, 0] = -1 + tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 96721f11cb2d6..efeca375affbb 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -1180,5 +1180,5 @@ def test_ops_error_str(): with pytest.raises(TypeError, match=msg): left > right - assert not left == right # pylint: disable=unneeded-not + assert not left == right assert left != right diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 73b2da0f7dd50..01e7ba52e58aa 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -11,6 +11,7 @@ import pytest from pandas._libs import lib +from pandas._libs.missing import NA from pandas._libs.tslibs import ( NaT, iNaT, @@ -138,6 +139,19 @@ def test_truediv_numeric(self, td): assert res._value == td._value / 2 assert res._creso == td._creso + def test_truediv_na_type_not_supported(self, td): + msg_td_floordiv_na = ( + r"unsupported operand type\(s\) for /: 'Timedelta' and 'NAType'" + ) + with pytest.raises(TypeError, match=msg_td_floordiv_na): + td / NA + + msg_na_floordiv_td = ( + r"unsupported operand type\(s\) for /: 'NAType' and 'Timedelta'" + ) + with pytest.raises(TypeError, match=msg_na_floordiv_td): + NA / td + def test_floordiv_timedeltalike(self, td): assert td // td == 1 assert (2.5 * td) // td == 2 @@ -182,6 +196,19 @@ def test_floordiv_numeric(self, td): assert res._value == td._value // 2 assert res._creso == td._creso + def test_floordiv_na_type_not_supported(self, td): + msg_td_floordiv_na = ( + r"unsupported operand type\(s\) for //: 'Timedelta' and 'NAType'" + ) + with pytest.raises(TypeError, match=msg_td_floordiv_na): + td // NA + + msg_na_floordiv_td = ( + r"unsupported operand type\(s\) for //: 'NAType' and 'Timedelta'" + ) + with pytest.raises(TypeError, match=msg_na_floordiv_td): + NA // td + def test_addsub_mismatched_reso(self, td): # need to cast to since td is out of bounds for ns, so # so we would raise OverflowError without casting diff --git a/pandas/tests/scalar/timestamp/test_comparisons.py b/pandas/tests/scalar/timestamp/test_comparisons.py index e7e5541cf499f..b2590c43e1ece 100644 --- a/pandas/tests/scalar/timestamp/test_comparisons.py +++ b/pandas/tests/scalar/timestamp/test_comparisons.py @@ -309,5 +309,5 @@ def __eq__(self, other) -> bool: for left, right in [(inf, timestamp), (timestamp, inf)]: assert left > right or left < right assert left >= right or left <= right - assert not left == right # pylint: 
disable=unneeded-not + assert not left == right assert left != right diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index bbda9d3ee7dce..4ebdea3733484 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -621,7 +621,6 @@ def test_constructor_with_stringoffset(self): ] timezones = [ - (None, 0), ("UTC", 0), (pytz.utc, 0), ("Asia/Tokyo", 9), @@ -1013,6 +1012,18 @@ def test_timestamp_constructed_by_date_and_tz(self, tz): assert result.hour == expected.hour assert result == expected + def test_explicit_tz_none(self): + # GH#48688 + msg = "Passed data is timezone-aware, incompatible with 'tz=None'" + with pytest.raises(ValueError, match=msg): + Timestamp(datetime(2022, 1, 1, tzinfo=timezone.utc), tz=None) + + with pytest.raises(ValueError, match=msg): + Timestamp("2022-01-01 00:00:00", tzinfo=timezone.utc, tz=None) + + with pytest.raises(ValueError, match=msg): + Timestamp("2022-01-01 00:00:00-0400", tz=None) + def test_constructor_ambiguous_dst(): # GH 24329 diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index b4493088acb31..e1299c272e5cc 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -118,7 +118,7 @@ def test_repr(self, date, freq, tz): def test_repr_utcoffset(self): # This can cause the tz field to be populated, but it's redundant to # include this information in the date-string. - date_with_utc_offset = Timestamp("2014-03-13 00:00:00-0400", tz=None) + date_with_utc_offset = Timestamp("2014-03-13 00:00:00-0400") assert "2014-03-13 00:00:00-0400" in repr(date_with_utc_offset) assert "tzoffset" not in repr(date_with_utc_offset) assert "UTC-04:00" in repr(date_with_utc_offset) diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index c5df1fd498938..1008c2c87dc9e 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -790,11 +790,9 @@ def test_interpolate_unsorted_index(self, ascending, expected_values): def test_interpolate_asfreq_raises(self): ser = Series(["a", None, "b"], dtype=object) - msg2 = "Series cannot interpolate with object dtype" - msg = "Invalid fill method" - with pytest.raises(TypeError, match=msg2): - with pytest.raises(ValueError, match=msg): - ser.interpolate(method="asfreq") + msg = "Can not interpolate with method=asfreq" + with pytest.raises(ValueError, match=msg): + ser.interpolate(method="asfreq") def test_interpolate_fill_value(self): # GH#54920 diff --git a/pandas/tests/series/methods/test_rename_axis.py b/pandas/tests/series/methods/test_rename_axis.py index 60175242a06b5..58c095d697ede 100644 --- a/pandas/tests/series/methods/test_rename_axis.py +++ b/pandas/tests/series/methods/test_rename_axis.py @@ -15,13 +15,13 @@ def test_rename_axis_mapper(self): ser = Series(list(range(len(mi))), index=mi) result = ser.rename_axis(index={"ll": "foo"}) - assert result.index.names == ("foo", "nn") + assert result.index.names == ["foo", "nn"] result = ser.rename_axis(index=str.upper, axis=0) - assert result.index.names == ("LL", "NN") + assert result.index.names == ["LL", "NN"] result = ser.rename_axis(index=["foo", "goo"]) - assert result.index.names == ("foo", "goo") + assert result.index.names == ["foo", "goo"] with pytest.raises(TypeError, match="unexpected"): ser.rename_axis(columns="wrong") 
diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 09a3469e73462..0a79bcea679a7 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -137,20 +137,15 @@ def test_replace_gh5319(self): # API change from 0.12? # GH 5319 ser = pd.Series([0, np.nan, 2, 3, 4]) - expected = ser.ffill() msg = ( - "Series.replace without 'value' and with non-dict-like " - "'to_replace' is deprecated" + "Series.replace must specify either 'value', " + "a dict-like 'to_replace', or dict-like 'regex'" ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.replace([np.nan]) - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match=msg): + ser.replace([np.nan]) - ser = pd.Series([0, np.nan, 2, 3, 4]) - expected = ser.ffill() - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.replace(np.nan) - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match=msg): + ser.replace(np.nan) def test_replace_datetime64(self): # GH 5797 @@ -182,19 +177,16 @@ def test_replace_timedelta_td64(self): def test_replace_with_single_list(self): ser = pd.Series([0, 1, 2, 3, 4]) - msg2 = ( - "Series.replace without 'value' and with non-dict-like " - "'to_replace' is deprecated" + msg = ( + "Series.replace must specify either 'value', " + "a dict-like 'to_replace', or dict-like 'regex'" ) - with tm.assert_produces_warning(FutureWarning, match=msg2): - result = ser.replace([1, 2, 3]) - tm.assert_series_equal(result, pd.Series([0, 0, 0, 0, 4])) + with pytest.raises(ValueError, match=msg): + ser.replace([1, 2, 3]) s = ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg2): - return_value = s.replace([1, 2, 3], inplace=True) - assert return_value is None - tm.assert_series_equal(s, pd.Series([0, 0, 0, 0, 4])) + with pytest.raises(ValueError, match=msg): + s.replace([1, 2, 3], inplace=True) def test_replace_mixed_types(self): ser = pd.Series(np.arange(5), dtype="int64") @@ -483,13 +475,8 @@ def test_replace_invalid_to_replace(self): r"Expecting 'to_replace' to be either a scalar, array-like, " r"dict or None, got invalid type.*" ) - msg2 = ( - "Series.replace without 'value' and with non-dict-like " - "'to_replace' is deprecated" - ) with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - series.replace(lambda x: x.strip()) + series.replace(lambda x: x.strip()) @pytest.mark.parametrize("frame", [False, True]) def test_replace_nonbool_regex(self, frame): diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index e292861012c8f..f7dec02ab0e5b 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -24,58 +24,55 @@ def read_csv(self, path, **kwargs): return out - def test_from_csv(self, datetime_series, string_series): + def test_from_csv(self, datetime_series, string_series, temp_file): # freq doesn't round-trip datetime_series.index = datetime_series.index._with_freq(None) - with tm.ensure_clean() as path: - datetime_series.to_csv(path, header=False) - ts = self.read_csv(path, parse_dates=True) - tm.assert_series_equal(datetime_series, ts, check_names=False) + path = temp_file + datetime_series.to_csv(path, header=False) + ts = self.read_csv(path, parse_dates=True) + tm.assert_series_equal(datetime_series, ts, check_names=False) - assert ts.name is None - assert ts.index.name is None 
+ assert ts.name is None + assert ts.index.name is None - # see gh-10483 - datetime_series.to_csv(path, header=True) - ts_h = self.read_csv(path, header=0) - assert ts_h.name == "ts" + # see gh-10483 + datetime_series.to_csv(path, header=True) + ts_h = self.read_csv(path, header=0) + assert ts_h.name == "ts" - string_series.to_csv(path, header=False) - series = self.read_csv(path) - tm.assert_series_equal(string_series, series, check_names=False) + string_series.to_csv(path, header=False) + series = self.read_csv(path) + tm.assert_series_equal(string_series, series, check_names=False) - assert series.name is None - assert series.index.name is None + assert series.name is None + assert series.index.name is None - string_series.to_csv(path, header=True) - series_h = self.read_csv(path, header=0) - assert series_h.name == "series" + string_series.to_csv(path, header=True) + series_h = self.read_csv(path, header=0) + assert series_h.name == "series" - with open(path, "w", encoding="utf-8") as outfile: - outfile.write("1998-01-01|1.0\n1999-01-01|2.0") + with open(path, "w", encoding="utf-8") as outfile: + outfile.write("1998-01-01|1.0\n1999-01-01|2.0") - series = self.read_csv(path, sep="|", parse_dates=True) - check_series = Series( - {datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0} - ) - tm.assert_series_equal(check_series, series) + series = self.read_csv(path, sep="|", parse_dates=True) + check_series = Series({datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0}) + tm.assert_series_equal(check_series, series) - series = self.read_csv(path, sep="|", parse_dates=False) - check_series = Series({"1998-01-01": 1.0, "1999-01-01": 2.0}) - tm.assert_series_equal(check_series, series) + series = self.read_csv(path, sep="|", parse_dates=False) + check_series = Series({"1998-01-01": 1.0, "1999-01-01": 2.0}) + tm.assert_series_equal(check_series, series) - def test_to_csv(self, datetime_series): - with tm.ensure_clean() as path: - datetime_series.to_csv(path, header=False) + def test_to_csv(self, datetime_series, temp_file): + datetime_series.to_csv(temp_file, header=False) - with open(path, newline=None, encoding="utf-8") as f: - lines = f.readlines() - assert lines[1] != "\n" + with open(temp_file, newline=None, encoding="utf-8") as f: + lines = f.readlines() + assert lines[1] != "\n" - datetime_series.to_csv(path, index=False, header=False) - arr = np.loadtxt(path) - tm.assert_almost_equal(arr, datetime_series.values) + datetime_series.to_csv(temp_file, index=False, header=False) + arr = np.loadtxt(temp_file) + tm.assert_almost_equal(arr, datetime_series.values) def test_to_csv_unicode_index(self): buf = StringIO() @@ -87,14 +84,13 @@ def test_to_csv_unicode_index(self): s2 = self.read_csv(buf, index_col=0, encoding="UTF-8") tm.assert_series_equal(s, s2) - def test_to_csv_float_format(self): - with tm.ensure_clean() as filename: - ser = Series([0.123456, 0.234567, 0.567567]) - ser.to_csv(filename, float_format="%.2f", header=False) + def test_to_csv_float_format(self, temp_file): + ser = Series([0.123456, 0.234567, 0.567567]) + ser.to_csv(temp_file, float_format="%.2f", header=False) - rs = self.read_csv(filename) - xp = Series([0.12, 0.23, 0.57]) - tm.assert_series_equal(rs, xp) + rs = self.read_csv(temp_file) + xp = Series([0.12, 0.23, 0.57]) + tm.assert_series_equal(rs, xp) def test_to_csv_list_entries(self): s = Series(["jack and jill", "jesse and frank"]) @@ -128,50 +124,49 @@ def test_to_csv_path_is_none(self): ), ], ) - def test_to_csv_compression(self, s, encoding, compression): - 
with tm.ensure_clean() as filename: - s.to_csv(filename, compression=compression, encoding=encoding, header=True) - # test the round trip - to_csv -> read_csv - result = pd.read_csv( - filename, - compression=compression, - encoding=encoding, - index_col=0, - ).squeeze("columns") - tm.assert_series_equal(s, result) - - # test the round trip using file handle - to_csv -> read_csv - with get_handle( - filename, "w", compression=compression, encoding=encoding - ) as handles: - s.to_csv(handles.handle, encoding=encoding, header=True) - - result = pd.read_csv( - filename, - compression=compression, - encoding=encoding, - index_col=0, - ).squeeze("columns") - tm.assert_series_equal(s, result) - - # explicitly ensure file was compressed - with tm.decompress_file(filename, compression) as fh: - text = fh.read().decode(encoding or "utf8") - assert s.name in text - - with tm.decompress_file(filename, compression) as fh: - tm.assert_series_equal( - s, - pd.read_csv(fh, index_col=0, encoding=encoding).squeeze("columns"), - ) - - def test_to_csv_interval_index(self, using_infer_string): + def test_to_csv_compression(self, s, encoding, compression, temp_file): + filename = temp_file + s.to_csv(filename, compression=compression, encoding=encoding, header=True) + # test the round trip - to_csv -> read_csv + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + ).squeeze("columns") + tm.assert_series_equal(s, result) + + # test the round trip using file handle - to_csv -> read_csv + with get_handle( + filename, "w", compression=compression, encoding=encoding + ) as handles: + s.to_csv(handles.handle, encoding=encoding, header=True) + + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + ).squeeze("columns") + tm.assert_series_equal(s, result) + + # explicitly ensure file was compressed + with tm.decompress_file(filename, compression) as fh: + text = fh.read().decode(encoding or "utf8") + assert s.name in text + + with tm.decompress_file(filename, compression) as fh: + tm.assert_series_equal( + s, + pd.read_csv(fh, index_col=0, encoding=encoding).squeeze("columns"), + ) + + def test_to_csv_interval_index(self, using_infer_string, temp_file): # GH 28210 s = Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3)) - with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path: - s.to_csv(path, header=False) - result = self.read_csv(path, index_col=0) + s.to_csv(temp_file, header=False) + result = self.read_csv(temp_file, index_col=0) # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = s diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 7b45a267a4572..a63ffbbd3a5a1 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -107,7 +107,7 @@ def test_contains(self, datetime_series): def test_axis_alias(self): s = Series([1, 2, np.nan]) tm.assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index")) - assert s.dropna().sum("rows") == 3 + assert s.dropna().sum(axis="rows") == 3 assert s._get_axis_number("rows") == 0 assert s._get_axis_name("rows") == "index" diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 00f48bf3b1d78..44bf3475b85a6 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -807,9 +807,6 @@ def test_series_ops_name_retention(self, flex, box, names, all_binary_operators) r"Logical ops \(and, or, xor\) 
between Pandas objects and " "dtype-less sequences" ) - warn = None - if box in [list, tuple] and is_logical: - warn = FutureWarning right = box(right) if flex: @@ -818,9 +815,12 @@ def test_series_ops_name_retention(self, flex, box, names, all_binary_operators) return result = getattr(left, name)(right) else: - # GH#37374 logical ops behaving as set ops deprecated - with tm.assert_produces_warning(warn, match=msg): - result = op(left, right) + if is_logical and box in [list, tuple]: + with pytest.raises(TypeError, match=msg): + # GH#52264 logical ops with dtype-less sequences deprecated + op(left, right) + return + result = op(left, right) assert isinstance(result, Series) if box in [Index, Series]: diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 68737e86f0c6a..3f9d5bbe806bb 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2176,6 +2176,25 @@ def test_series_constructor_infer_multiindex(self, container, data): multi = Series(data, index=indexes) assert isinstance(multi.index, MultiIndex) + # TODO: make this not cast to object in pandas 3.0 + @pytest.mark.skipif( + not np_version_gt2, reason="StringDType only available in numpy 2 and above" + ) + @pytest.mark.parametrize( + "data", + [ + ["a", "b", "c"], + ["a", "b", np.nan], + ], + ) + def test_np_string_array_object_cast(self, data): + from numpy.dtypes import StringDType + + arr = np.array(data, dtype=StringDType()) + res = Series(arr) + assert res.dtype == np.object_ + assert (res == data).all() + class TestSeriesConstructorInternals: def test_constructor_no_pandas_array(self): @@ -2251,3 +2270,9 @@ def test_series_with_complex_nan(input_list): result = Series(ser.array) assert ser.dtype == "complex128" tm.assert_series_equal(ser, result) + + +def test_dict_keys_rangeindex(): + result = Series({0: 1, 1: 2}) + expected = Series([1, 2], index=RangeIndex(2)) + tm.assert_series_equal(result, expected, check_index_type=True) diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index b76b69289b72f..f59eacea3fe6c 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -86,7 +86,7 @@ def test_logical_operators_int_dtype_with_float(self): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") - warn_msg = ( + err_msg = ( r"Logical ops \(and, or, xor\) between Pandas objects and " "dtype-less sequences" ) @@ -97,9 +97,8 @@ def test_logical_operators_int_dtype_with_float(self): with pytest.raises(TypeError, match=msg): s_0123 & 3.14 msg = "unsupported operand type.+for &:" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s_0123 & [0.1, 4, 3.14, 2] + with pytest.raises(TypeError, match=err_msg): + s_0123 & [0.1, 4, 3.14, 2] with pytest.raises(TypeError, match=msg): s_0123 & np.array([0.1, 4, 3.14, 2]) with pytest.raises(TypeError, match=msg): @@ -108,7 +107,7 @@ def test_logical_operators_int_dtype_with_float(self): def test_logical_operators_int_dtype_with_str(self): s_1111 = Series([1] * 4, dtype="int8") - warn_msg = ( + err_msg = ( r"Logical ops \(and, or, xor\) between Pandas objects and " "dtype-less sequences" ) @@ -116,9 +115,8 @@ def test_logical_operators_int_dtype_with_str(self): msg = "Cannot perform 'and_' with a dtyped.+array and scalar of type" with pytest.raises(TypeError, match=msg): s_1111 & "a" - with pytest.raises(TypeError, 
match="unsupported operand.+for &"): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s_1111 & ["a", "b", "c", "d"] + with pytest.raises(TypeError, match=err_msg): + s_1111 & ["a", "b", "c", "d"] def test_logical_operators_int_dtype_with_bool(self): # GH#9016: support bitwise op for integer types @@ -129,17 +127,15 @@ def test_logical_operators_int_dtype_with_bool(self): result = s_0123 & False tm.assert_series_equal(result, expected) - warn_msg = ( + msg = ( r"Logical ops \(and, or, xor\) between Pandas objects and " "dtype-less sequences" ) - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - result = s_0123 & [False] - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match=msg): + s_0123 & [False] - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - result = s_0123 & (False,) - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match=msg): + s_0123 & (False,) result = s_0123 ^ False expected = Series([False, True, True, True]) @@ -188,9 +184,8 @@ def test_logical_ops_bool_dtype_with_ndarray(self): ) expected = Series([True, False, False, False, False]) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = left & right - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match=msg): + left & right result = left & np.array(right) tm.assert_series_equal(result, expected) result = left & Index(right) @@ -199,9 +194,8 @@ def test_logical_ops_bool_dtype_with_ndarray(self): tm.assert_series_equal(result, expected) expected = Series([True, True, True, True, True]) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = left | right - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match=msg): + left | right result = left | np.array(right) tm.assert_series_equal(result, expected) result = left | Index(right) @@ -210,9 +204,8 @@ def test_logical_ops_bool_dtype_with_ndarray(self): tm.assert_series_equal(result, expected) expected = Series([False, True, True, True, True]) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = left ^ right - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match=msg): + left ^ right result = left ^ np.array(right) tm.assert_series_equal(result, expected) result = left ^ Index(right) @@ -233,26 +226,22 @@ def test_logical_operators_int_dtype_with_bool_dtype_and_reindex(self): # s_0123 will be all false now because of reindexing like s_tft expected = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) - with tm.assert_produces_warning(FutureWarning): - result = s_tft & s_0123 + result = s_tft & s_0123 tm.assert_series_equal(result, expected) - # GH 52538: Deprecate casting to object type when reindex is needed; + # GH#52538: no longer to object type when reindex is needed; # matches DataFrame behavior - expected = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) - with tm.assert_produces_warning(FutureWarning): - result = s_0123 & s_tft - tm.assert_series_equal(result, expected) + msg = r"unsupported operand type\(s\) for &: 'float' and 'bool'" + with pytest.raises(TypeError, match=msg): + s_0123 & s_tft s_a0b1c0 = Series([1], list("b")) - with tm.assert_produces_warning(FutureWarning): - res = s_tft & s_a0b1c0 + res = s_tft & s_a0b1c0 expected = s_tff.reindex(list("abc")) tm.assert_series_equal(res, expected) - with tm.assert_produces_warning(FutureWarning): - res = s_tft | s_a0b1c0 + res = s_tft | s_a0b1c0 expected = s_tft.reindex(list("abc")) 
tm.assert_series_equal(res, expected) @@ -273,9 +262,8 @@ def test_scalar_na_logical_ops_corners(self): r"Logical ops \(and, or, xor\) between Pandas objects and " "dtype-less sequences" ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s & list(s) - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match=msg): + s & list(s) def test_scalar_na_logical_ops_corners_aligns(self): s = Series([2, 3, 4, 5, 6, 7, 8, 9, datetime(2005, 1, 1)]) @@ -405,27 +393,24 @@ def test_logical_ops_label_based(self, using_infer_string): tm.assert_series_equal(result, expected) # vs non-matching - with tm.assert_produces_warning(FutureWarning): - result = a & Series([1], ["z"]) + result = a & Series([1], ["z"]) expected = Series([False, False, False, False], list("abcz")) tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): - result = a | Series([1], ["z"]) + result = a | Series([1], ["z"]) expected = Series([True, True, False, False], list("abcz")) tm.assert_series_equal(result, expected) # identity # we would like s[s|e] == s to hold for any e, whether empty or not - with tm.assert_produces_warning(FutureWarning): - for e in [ - empty.copy(), - Series([1], ["z"]), - Series(np.nan, b.index), - Series(np.nan, a.index), - ]: - result = a[a | e] - tm.assert_series_equal(result, a[a]) + for e in [ + empty.copy(), + Series([1], ["z"]), + Series(np.nan, b.index), + Series(np.nan, a.index), + ]: + result = a[a | e] + tm.assert_series_equal(result, a[a]) for e in [Series(["z"])]: warn = FutureWarning if using_infer_string else None @@ -519,7 +504,6 @@ def test_logical_ops_df_compat(self): tm.assert_frame_equal(s3.to_frame() | s4.to_frame(), exp_or1.to_frame()) tm.assert_frame_equal(s4.to_frame() | s3.to_frame(), exp_or.to_frame()) - @pytest.mark.xfail(reason="Will pass once #52839 deprecation is enforced") def test_int_dtype_different_index_not_bool(self): # GH 52500 ser1 = Series([1, 2, 3], index=[10, 11, 23], name="a") diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 94a6910509e2d..36a2afb2162c2 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -289,7 +289,7 @@ def test_multiply(self, values_for_np_reduce, box_with_array, request): else: msg = "|".join( [ - "does not support reduction", + "does not support operation", "unsupported operand type", "ufunc 'multiply' cannot use operands", ] @@ -319,7 +319,7 @@ def test_add(self, values_for_np_reduce, box_with_array): else: msg = "|".join( [ - "does not support reduction", + "does not support operation", "unsupported operand type", "ufunc 'add' cannot use operands", ] diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 9c2b9a76bbb83..bcecd1b2d5eec 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -205,6 +205,18 @@ class MyList(list): val = MyList([True]) assert com.is_bool_indexer(val) + def test_frozenlist(self): + # GH#42461 + data = {"col1": [1, 2], "col2": [3, 4]} + df = pd.DataFrame(data=data) + + frozen = df.index.names[1:] + assert not com.is_bool_indexer(frozen) + + result = df[frozen] + expected = df[[]] + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("with_exception", [True, False]) def test_temp_setattr(with_exception): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 4e2af9fef377b..97e0fa93c90ef 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -135,7 
+135,7 @@ def test_multilevel_consolidate(self): df = DataFrame( np.random.default_rng(2).standard_normal((4, 4)), index=index, columns=index ) - df["Totals", ""] = df.sum(1) + df["Totals", ""] = df.sum(axis=1) df = df._consolidate() def test_level_with_tuples(self): diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 7992b48a4b0cc..b59dd194cac27 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3545,19 +3545,27 @@ def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_fir # issued in _array_to_datetime_object both_strs = isinstance(aware_val, str) and isinstance(naive_val, str) has_numeric = isinstance(naive_val, (int, float)) + both_datetime = isinstance(naive_val, datetime) and isinstance(aware_val, datetime) + + mixed_msg = ( + "Mixed timezones detected. Pass utc=True in to_datetime or tz='UTC' " + "in DatetimeIndex to convert to a common timezone" + ) first_non_null = next(x for x in vec if x != "") # if first_non_null is a not a string, _guess_datetime_format_for_array # doesn't guess a format so we don't go through array_strptime if not isinstance(first_non_null, str): # that case goes through array_strptime which has different behavior - msg = "Cannot mix tz-aware with tz-naive values" + msg = mixed_msg if naive_first and isinstance(aware_val, Timestamp): if isinstance(naive_val, Timestamp): msg = "Tz-aware datetime.datetime cannot be converted to datetime64" with pytest.raises(ValueError, match=msg): to_datetime(vec) else: + if not naive_first and both_datetime: + msg = "Cannot mix tz-aware with tz-naive values" with pytest.raises(ValueError, match=msg): to_datetime(vec) @@ -3586,7 +3594,7 @@ def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_fir to_datetime(vec, utc=True) else: - msg = "Mixed timezones detected. Pass utc=True in to_datetime" + msg = mixed_msg with pytest.raises(ValueError, match=msg): to_datetime(vec) @@ -3594,13 +3602,13 @@ def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_fir to_datetime(vec, utc=True) if both_strs: - msg = "Mixed timezones detected. 
Pass utc=True in to_datetime" + msg = mixed_msg with pytest.raises(ValueError, match=msg): to_datetime(vec, format="mixed") with pytest.raises(ValueError, match=msg): DatetimeIndex(vec) else: - msg = "Cannot mix tz-aware with tz-naive values" + msg = mixed_msg if naive_first and isinstance(aware_val, Timestamp): if isinstance(naive_val, Timestamp): msg = "Tz-aware datetime.datetime cannot be converted to datetime64" @@ -3609,6 +3617,8 @@ def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_fir with pytest.raises(ValueError, match=msg): DatetimeIndex(vec) else: + if not naive_first and both_datetime: + msg = "Cannot mix tz-aware with tz-naive values" with pytest.raises(ValueError, match=msg): to_datetime(vec, format="mixed") with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index 78ff774c188fe..dc6efdcec380e 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -198,8 +198,8 @@ def test_index_equal_names(name1, name2): msg = f"""Index are different Attribute "names" are different -\\[left\\]: \\({name1},\\) -\\[right\\]: \\({name2},\\)""" +\\[left\\]: \\[{name1}\\] +\\[right\\]: \\[{name2}\\]""" with pytest.raises(AssertionError, match=msg): tm.assert_index_equal(idx1, idx2) diff --git a/pandas/tests/util/test_assert_produces_warning.py b/pandas/tests/util/test_assert_produces_warning.py index 80e3264690f81..5b917dbbe7ba7 100644 --- a/pandas/tests/util/test_assert_produces_warning.py +++ b/pandas/tests/util/test_assert_produces_warning.py @@ -42,7 +42,6 @@ def f(): warnings.warn("f2", RuntimeWarning) -@pytest.mark.filterwarnings("ignore:f1:FutureWarning") def test_assert_produces_warning_honors_filter(): # Raise by default. 
msg = r"Caused unexpected warning\(s\)" @@ -180,6 +179,44 @@ def test_match_multiple_warnings(): warnings.warn("Match this too", UserWarning) +def test_must_match_multiple_warnings(): + # https://github.com/pandas-dev/pandas/issues/56555 + category = (FutureWarning, UserWarning) + msg = "Did not see expected warning of class 'UserWarning'" + with pytest.raises(AssertionError, match=msg): + with tm.assert_produces_warning(category, match=r"^Match this"): + warnings.warn("Match this", FutureWarning) + + +def test_must_match_multiple_warnings_messages(): + # https://github.com/pandas-dev/pandas/issues/56555 + category = (FutureWarning, UserWarning) + msg = r"The emitted warning messages are \[UserWarning\('Not this'\)\]" + with pytest.raises(AssertionError, match=msg): + with tm.assert_produces_warning(category, match=r"^Match this"): + warnings.warn("Match this", FutureWarning) + warnings.warn("Not this", UserWarning) + + +def test_allow_partial_match_for_multiple_warnings(): + # https://github.com/pandas-dev/pandas/issues/56555 + category = (FutureWarning, UserWarning) + with tm.assert_produces_warning( + category, match=r"^Match this", must_find_all_warnings=False + ): + warnings.warn("Match this", FutureWarning) + + +def test_allow_partial_match_for_multiple_warnings_messages(): + # https://github.com/pandas-dev/pandas/issues/56555 + category = (FutureWarning, UserWarning) + with tm.assert_produces_warning( + category, match=r"^Match this", must_find_all_warnings=False + ): + warnings.warn("Match this", FutureWarning) + warnings.warn("Not this", UserWarning) + + def test_right_category_wrong_match_raises(pair_different_warnings): target_category, other_category = pair_different_warnings with pytest.raises(AssertionError, match="Did not see warning.*matching"): diff --git a/pandas/tests/util/test_deprecate_nonkeyword_arguments.py b/pandas/tests/util/test_deprecate_nonkeyword_arguments.py index e74ff89b11581..f81d32b574682 100644 --- a/pandas/tests/util/test_deprecate_nonkeyword_arguments.py +++ b/pandas/tests/util/test_deprecate_nonkeyword_arguments.py @@ -124,8 +124,7 @@ def test_i_signature(): class Foo: @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "bar"]) - def baz(self, bar=None, foobar=None): # pylint: disable=disallowed-name - ... + def baz(self, bar=None, foobar=None): ... def test_foo_signature(): diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 47bfc219d0fe9..85821ed2cfb6f 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -589,7 +589,7 @@ def test_multi_index_names(): result = df.rolling(3).cov() tm.assert_index_equal(result.columns, df.columns) - assert result.index.names == (None, "1", "2") + assert result.index.names == [None, "1", "2"] def test_rolling_axis_sum(): diff --git a/pyproject.toml b/pyproject.toml index 5f5b013ca8461..085c054f8241a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,12 +6,9 @@ requires = [ "meson==1.2.1", "wheel", "Cython~=3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json - # Any NumPy version should be fine for compiling. Users are unlikely - # to get a NumPy<1.25 so the result will be compatible with all relevant - # NumPy versions (if not it is presumably compatible with their version). - # Pin <2.0 for releases until tested against an RC. But explicitly allow - # testing the `.dev0` nightlies (which require the extra index). 
- "numpy>1.22.4,<=2.0.0.dev0", + # Force numpy higher than 2.0rc1, so that built wheels are compatible + # with both numpy 1 and 2 + "numpy>=2.0.0rc1", "versioneer[toml]" ] @@ -237,6 +234,12 @@ select = [ "G", # flake8-future-annotations "FA", + # unconventional-import-alias + "ICN001", + # flake8-slots + "SLOT", + # flake8-raise + "RSE" ] ignore = [ @@ -315,7 +318,61 @@ ignore = [ # pairwise-over-zipped (>=PY310 only) "RUF007", # mutable-class-default - "RUF012" + "RUF012", + + # Additional pylint rules + # literal-membership + "PLR6201", # 847 errors + # Method could be a function, class method, or static method + "PLR6301", # 11411 errors + # Private name import + "PLC2701", # 27 errors + # Too many positional arguments (6/5) + "PLR0917", # 470 errors + # compare-to-empty-string + "PLC1901", + # `tempfile.NamedTemporaryFile` in text mode without explicit `encoding` argument + "PLW1514", # 1 error + # Object does not implement `__hash__` method + "PLW1641", # 16 errors + # Bad or misspelled dunder method name + "PLW3201", # 69 errors, seems to be all false positive + # Unnecessary lookup of dictionary value by key + "PLR1733", # 5 errors, it seems like we wannt to ignore these + # Unnecessary lookup of list item by index + "PLR1736", # 4 errors, we're currently having inline pylint ignore + # empty-comment + "PLR2044", # autofixable + # Unpacking a dictionary in iteration without calling `.items()` + "PLE1141", # autofixable + # import-outside-toplevel + "PLC0415", + # unnecessary-dunder-call + "PLC2801", + # comparison-with-itself + "PLR0124", + # too-many-public-methods + "PLR0904", + # too-many-return-statements + "PLR0911", + # too-many-branches + "PLR0912", + # too-many-arguments + "PLR0913", + # too-many-locals + "PLR0914", + # too-many-statements + "PLR0915", + # too-many-boolean-expressions + "PLR0916", + # too-many-nested-blocks + "PLR1702", + # redefined-argument-from-local + "PLR1704", + # unnecessary-lambda + "PLW0108", + # global-statement + "PLW0603", ] exclude = [ @@ -364,110 +421,6 @@ mark-parentheses = false [tool.ruff.format] docstring-code-format = true -[tool.pylint.messages_control] -max-line-length = 88 -disable = [ - # intentionally turned off - "bad-mcs-classmethod-argument", - "broad-except", - "c-extension-no-member", - "comparison-with-itself", - "consider-using-enumerate", - "import-error", - "import-outside-toplevel", - "invalid-name", - "invalid-unary-operand-type", - "line-too-long", - "no-else-continue", - "no-else-raise", - "no-else-return", - "no-member", - "no-name-in-module", - "not-an-iterable", - "overridden-final-method", - "pointless-statement", - "redundant-keyword-arg", - "singleton-comparison", - "too-many-ancestors", - "too-many-arguments", - "too-many-boolean-expressions", - "too-many-branches", - "too-many-function-args", - "too-many-instance-attributes", - "too-many-locals", - "too-many-nested-blocks", - "too-many-public-methods", - "too-many-return-statements", - "too-many-statements", - "unexpected-keyword-arg", - "ungrouped-imports", - "unsubscriptable-object", - "unsupported-assignment-operation", - "unsupported-membership-test", - "unused-import", - "use-dict-literal", - "use-implicit-booleaness-not-comparison", - "use-implicit-booleaness-not-len", - "wrong-import-order", - "wrong-import-position", - "redefined-loop-name", - - # misc - "abstract-class-instantiated", - "no-value-for-parameter", - "undefined-variable", - "unpacking-non-sequence", - "used-before-assignment", - - # pylint type "C": convention, for programming standard 
violation - "missing-class-docstring", - "missing-function-docstring", - "missing-module-docstring", - "superfluous-parens", - "too-many-lines", - "unidiomatic-typecheck", - "unnecessary-dunder-call", - "unnecessary-lambda-assignment", - - # pylint type "R": refactor, for bad code smell - "consider-using-with", - "cyclic-import", - "duplicate-code", - "inconsistent-return-statements", - "redefined-argument-from-local", - "too-few-public-methods", - - # pylint type "W": warning, for python specific problems - "abstract-method", - "arguments-differ", - "arguments-out-of-order", - "arguments-renamed", - "attribute-defined-outside-init", - "broad-exception-raised", - "comparison-with-callable", - "dangerous-default-value", - "deprecated-module", - "eval-used", - "expression-not-assigned", - "fixme", - "global-statement", - "invalid-overridden-method", - "keyword-arg-before-vararg", - "possibly-unused-variable", - "protected-access", - "raise-missing-from", - "redefined-builtin", - "redefined-outer-name", - "self-cls-assignment", - "signature-differs", - "super-init-not-called", - "try-except-raise", - "unnecessary-lambda", - "unused-argument", - "unused-variable", - "using-constant-test" -] - [tool.pytest.ini_options] # sync minversion with pyproject.toml & install.rst minversion = "7.3.2" diff --git a/requirements-dev.txt b/requirements-dev.txt index a42ee1587961a..042f0a455de01 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -64,6 +64,7 @@ natsort numpydoc pydata-sphinx-theme==0.14 pytest-cython +docutils < 0.21 sphinx sphinx-design sphinx-copybutton diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index ca1dc0c961c42..ed7b9affe9a50 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -231,104 +231,6 @@ exclude = [ "env", ] -[tool.pylint.messages_control] -max-line-length = 88 -disable = [ - # intentionally turned off - "broad-except", - "c-extension-no-member", - "comparison-with-itself", - "import-error", - "import-outside-toplevel", - "invalid-name", - "invalid-unary-operand-type", - "line-too-long", - "no-else-continue", - "no-else-raise", - "no-else-return", - "no-member", - "no-name-in-module", - "not-an-iterable", - "overridden-final-method", - "pointless-statement", - "redundant-keyword-arg", - "singleton-comparison", - "too-many-ancestors", - "too-many-arguments", - "too-many-boolean-expressions", - "too-many-branches", - "too-many-function-args", - "too-many-instance-attributes", - "too-many-locals", - "too-many-nested-blocks", - "too-many-public-methods", - "too-many-return-statements", - "too-many-statements", - "unexpected-keyword-arg", - "ungrouped-imports", - "unsubscriptable-object", - "unsupported-assignment-operation", - "unsupported-membership-test", - "unused-import", - "use-implicit-booleaness-not-comparison", - "use-implicit-booleaness-not-len", - "wrong-import-order", - "wrong-import-position", - - # misc - "abstract-class-instantiated", - "no-value-for-parameter", - "undefined-variable", - "unpacking-non-sequence", - - # pylint type "C": convention, for programming standard violation - "missing-class-docstring", - "missing-function-docstring", - "missing-module-docstring", - "too-many-lines", - "unidiomatic-typecheck", - "unnecessary-dunder-call", - "unnecessary-lambda-assignment", - - # pylint type "R": refactor, for bad code smell - "consider-using-with", - "cyclic-import", - "duplicate-code", - "inconsistent-return-statements", - "redefined-argument-from-local", - 
"too-few-public-methods", - - # pylint type "W": warning, for python specific problems - "abstract-method", - "arguments-differ", - "arguments-out-of-order", - "arguments-renamed", - "attribute-defined-outside-init", - "comparison-with-callable", - "dangerous-default-value", - "deprecated-module", - "eval-used", - "expression-not-assigned", - "fixme", - "global-statement", - "invalid-overridden-method", - "keyword-arg-before-vararg", - "possibly-unused-variable", - "protected-access", - "raise-missing-from", - "redefined-builtin", - "redefined-outer-name", - "self-cls-assignment", - "signature-differs", - "super-init-not-called", - "try-except-raise", - "unnecessary-lambda", - "unspecified-encoding", - "unused-argument", - "unused-variable", - "using-constant-test" -] - [tool.pytest.ini_options] # sync minversion with pyproject.toml & install.rst minversion = "7.0" diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index d2e92bb971888..3bffd1f1987aa 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -36,7 +36,7 @@ def redundant_import(self, paramx=None, paramy=None) -> None: >>> import pandas as pd >>> df = pd.DataFrame(np.ones((3, 3)), ... columns=('a', 'b', 'c')) - >>> df.all(1) + >>> df.all(axis=1) 0 True 1 True 2 True diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index a732d3f83a40a..ba3123a07df4b 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -54,6 +54,7 @@ # TODO(4.0): GH#55043 - remove upon removal of CoW option "_get_option", "_fill_limit_area_1d", + "_make_block", } diff --git a/web/pandas/pdeps/0001-purpose-and-guidelines.md b/web/pandas/pdeps/0001-purpose-and-guidelines.md index 24c91fbab0808..bb15b8f997b11 100644 --- a/web/pandas/pdeps/0001-purpose-and-guidelines.md +++ b/web/pandas/pdeps/0001-purpose-and-guidelines.md @@ -6,7 +6,7 @@ [#51417](https://github.com/pandas-dev/pandas/pull/51417) - Author: [Marc Garcia](https://github.com/datapythonista), [Noa Tamir](https://github.com/noatamir) -- Revision: 2 +- Revision: 3 ## PDEP definition, purpose and scope @@ -56,31 +56,102 @@ advisor on the PDEP when it is submitted to the PDEP repository. ### Workflow +#### Rationale + +Our workflow was created to support and enable a consensus seeking process, and to provide clarity, +for current and future authors, as well as voting members. It is not a strict policy, and we +discourage any interpretation which seeks to take advantage of it in a way that could "force" or +"sneak" decisions in one way or another. We expect and encourage transparency, active discussion, +feedback, and compromise from all our community members. + +#### PDEP States + The possible states of a PDEP are: +- Draft - Under discussion - Accepted - Implemented - Rejected +- Withdrawn Next is described the workflow that PDEPs can follow. #### Submitting a PDEP -Proposing a PDEP is done by creating a PR adding a new file to `web/pdeps/`. -The file is a markdown file, you can use `web/pdeps/0001.md` as a reference +Proposing a PDEP is done by creating a PR adding a new file to `web/pandas/pdeps/`. +The file is a markdown file, you can use `web/pandas/pdeps/0001-purpose-and-guidelines.md` as a reference for the expected format. -The initial status of a PDEP will be `Status: Under discussion`. This will be changed to -`Status: Accepted` when the PDEP is ready and has the approval of the core team. 
+The initial status of a PDEP will be `Status: Draft`. This will be changed to +`Status: Under discussion` by the author(s) when they are ready to proceed with the decision +making process. -#### Accepted PDEP +#### PDEP Discussion Timeline + +A PDEP discussion will remain open for up to 60 days. This period aims to enable participation +from volunteers, who might not always be available to respond quickly, as well as provide ample +time to make changes based on suggestions and considerations offered by the participants. +Similarly, the following voting period will remain open for 15 days. + +To enable and encourage discussions on PDEPs, we follow a notification schedule. At each of the +following steps, the pandas team and the pandas-dev mailing list are notified via GitHub and +E-mail: + +- Once a PDEP is ready for discussion. +- After 30 days, with a note that there is at most 30 days remaining for discussion, + and that a vote will be called for if no discussion occurs in the next 15 days. +- After 45 days, with a note that there is at most 15 days remaining for discussion, + and that a vote will be called for in 15 days. +- Once the voting period starts, after 60 days or in case of an earlier vote, with 15 days + remaining for voting. +- After 10 voting days, with 5 days remaining for voting. + +After 30 discussion days, if 15 days have passed without any new unaddressed comments, +the authors may close the discussion period preemptively by sending an early reminder +of 15 days remaining until the voting period starts. + +#### Casting Votes + +As the voting period starts, a VOTE issue is created which links to the PDEP discussion pull request. +Each voting member, including author(s) with voting rights, may cast a vote by adding one of the following comments: + +- +1: approve. +- 0: abstain. + - Reason: A one sentence reason is required. +- -1: disapprove + - Reason: A one sentence reason is required. + +A disapprove vote requires prior participation in the PDEP discussion issue. -A PDEP can only be accepted by the core development team, if the proposal is considered -worth implementing. Decisions will be made based on the process detailed in the -[pandas governance document](https://github.com/pandas-dev/pandas-governance/blob/master/governance.md). -In general, more than one approval will be needed before the PR is merged. And -there should not be any `Request changes` review at the time of merging. +Comments made on the public VOTE issue by non-voting members will be deleted. + +Once the voting period ends, any voter may tally the votes in a comment, using the format: w-x-y-z, +where w stands for the total of approving, x of abstaining, y of disapproving votes cast, and z +for the number of voting members who did not respond to the VOTE issue. The tally of the votes will state +if a quorum has been reached or not. + +#### Quorum and Majority + +For a PDEP vote to result in accepting the proposal, a quorum is required. All votes (including +abstentions) are counted towards the quorum. The quorum is computed as the lower of these two +values: + +- 11 voting members. +- 50% of voting members. + +Given a quorum, a majority of 70% of the non-abstaining votes is required as well, i.e. 70% of +the approving and disapproving votes must be in favor. + +Thus, abstaining votes count towards a quorum, but not towards a majority. 
A voting member might +choose to abstain when they have participated in the discussion and have some objections to the +proposal, but do not wish to stop the proposal from moving forward, nor indicate their full +support. + +If a quorum is not reached by the end of the voting period, the PDEP is not accepted. Its status +will change to `Rejected`. + +#### Accepted PDEP Once a PDEP is accepted, any contributions can be made toward the implementation of the PDEP, with an open-ended completion timeline. Development of pandas is difficult to understand and @@ -109,6 +180,17 @@ discussion. A PDEP can be rejected for different reasons, for example good ideas that are not backward-compatible, and the breaking changes are not considered worth implementing. +The PDEP author(s) can also decide to withdraw the PDEP before a final decision +is made (`Status: Withdrawn`), when the PDEP authors themselves have decided +that the PDEP is actually a bad idea, or have accepted that it is not broadly +supported or that a competing proposal is a better alternative. + +The author(s) may choose to resubmit a rejected or withdrawn PDEP. We expect +authors to use their judgement in that case as to whether they believe more +discussion or an amended proposal has the potential to lead to a different +result. A new PDEP is then created, which includes a link to the previously +rejected PDEP. + #### Invalid PDEP For submitted PDEPs that do not contain proper documentation, are out of scope, or @@ -184,6 +266,7 @@ hope can help clarify our meaning here: - 3 August 2022: Initial version ([GH-47938][47938]) - 15 February 2023: Version 2 ([GH-51417][51417]) clarifies the scope of PDEPs and adds examples +- 09 June 2023: Version 3 ([GH-53576][53576]) defines a structured decision-making process for PDEPs [7217]: https://github.com/pandas-dev/pandas/pull/7217 [8074]: https://github.com/pandas-dev/pandas/issues/8074
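The quorum and majority rules added to PDEP-1 above lend themselves to a short worked example. The sketch below is illustrative only and is not part of this diff or of pandas; the helper name `pdep_vote_outcome` and its signature are hypothetical, while the thresholds (quorum is the lower of 11 voting members and 50% of all voting members, and 70% of the non-abstaining votes must approve) come directly from the text above.

```python
# Illustrative sketch of the PDEP-1 vote arithmetic described above.
# Not part of pandas; the function and its return format are hypothetical.

def pdep_vote_outcome(approve: int, abstain: int, disapprove: int, no_response: int) -> str:
    total_members = approve + abstain + disapprove + no_response
    votes_cast = approve + abstain + disapprove  # abstentions count towards the quorum

    # Quorum: the lower of 11 voting members and 50% of voting members.
    quorum = min(11, 0.5 * total_members)
    if votes_cast < quorum:
        return f"rejected (no quorum: {votes_cast} votes cast, {quorum:g} needed)"

    # Majority: 70% of the non-abstaining votes must be approvals.
    non_abstaining = approve + disapprove
    if non_abstaining and approve / non_abstaining >= 0.70:
        return f"accepted ({approve}-{abstain}-{disapprove}-{no_response})"
    return f"rejected ({approve}-{abstain}-{disapprove}-{no_response})"


# Example: with 20 voting members the quorum is min(11, 10) = 10.
# 8 approve, 2 abstain, 1 disapproves, 9 do not respond: 11 votes cast meet the
# quorum, and 8/9 (about 89%) of the non-abstaining votes approve -> accepted.
print(pdep_vote_outcome(approve=8, abstain=2, disapprove=1, no_response=9))
```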
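Separately, the test updates earlier in this diff reflect two user-facing enforcement changes. The snippet below is a minimal illustration, assuming a pandas build that includes these changes; the error messages it matches are taken from the updated tests, and the exact input values are arbitrary.

```python
# Illustrative only: mirrors what the updated tests assert for a pandas
# version that includes these enforcement changes.
import pandas as pd
import pytest

# Logical ops between a Series and a dtype-less sequence (list/tuple) now
# raise a TypeError instead of emitting the former FutureWarning.
s = pd.Series([0, 1, 2, 3])
with pytest.raises(TypeError, match="dtype-less sequences"):
    s & [False]

# Mixing tz-aware and tz-naive string inputs in to_datetime raises a
# ValueError whose message now points users at utc=True / tz="UTC".
with pytest.raises(ValueError, match="Mixed timezones detected"):
    pd.to_datetime(["2020-01-01 00:00:00", "2020-01-01 00:00:00+01:00"])
```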