diff --git a/.github/workflows/broken-linkcheck.yml b/.github/workflows/broken-linkcheck.yml deleted file mode 100644 index 134253848f710..0000000000000 --- a/.github/workflows/broken-linkcheck.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: Linkcheck -on: - schedule: - # Run monthly on the 1st day of the month - - cron: '0 0 1 * *' - pull_request: - paths: - - ".github/workflows/broken-linkcheck.yml" - - "doc/make.py" -jobs: - linkcheck: - if: false - runs-on: ubuntu-latest - defaults: - run: - shell: bash -el {0} - - steps: - - name: Checkout - uses: actions/checkout@v5 - with: - fetch-depth: 0 - - - name: Set up Conda - uses: ./.github/actions/setup-conda - - - name: Build Pandas - uses: ./.github/actions/build_pandas - - - name: Run linkcheck script - working-directory: ./doc - run: | - set -o pipefail - python make.py linkcheck | tee linkcheck.txt - - - name: Display broken links - if: failure() - working-directory: ./doc - run: grep broken linkcheck.txt diff --git a/.github/workflows/cache-cleanup-daily.yml b/.github/workflows/cache-cleanup-daily.yml index 8eadfb2ccd2a9..517d9839f4832 100644 --- a/.github/workflows/cache-cleanup-daily.yml +++ b/.github/workflows/cache-cleanup-daily.yml @@ -6,7 +6,7 @@ on: jobs: cleanup: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 if: github.repository_owner == 'pandas-dev' permissions: actions: write diff --git a/.github/workflows/cache-cleanup.yml b/.github/workflows/cache-cleanup.yml index 099974141c1d1..e355c8bc8acf3 100644 --- a/.github/workflows/cache-cleanup.yml +++ b/.github/workflows/cache-cleanup.yml @@ -6,7 +6,8 @@ on: jobs: cleanup: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 + if: github.repository_owner == 'pandas-dev' steps: - name: Clean Cache run: | diff --git a/.github/workflows/comment-commands.yml b/.github/workflows/comment-commands.yml index e31d9ab343459..053907e6cebbc 100644 --- a/.github/workflows/comment-commands.yml +++ b/.github/workflows/comment-commands.yml @@ -9,15 +9,6 @@ permissions: pull-requests: write jobs: - issue_assign: - runs-on: ubuntu-24.04 - if: (!github.event.issue.pull_request) && github.event.comment.body == 'take' - concurrency: - group: ${{ github.actor }}-issue-assign - steps: - - run: | - echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" - curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees preview_docs: runs-on: ubuntu-24.04 if: github.event.issue.pull_request && github.event.comment.body == '/preview' @@ -28,64 +19,3 @@ jobs: with: previewer-server: "https://pandas.pydata.org/preview" artifact-job: "Doc Build and Upload" - asv_run: - runs-on: ubuntu-24.04 - # TODO: Support more benchmarking options later, against different branches, against self, etc - if: github.event.issue.pull_request && startsWith(github.event.comment.body, '@github-actions benchmark') - defaults: - run: - shell: bash -el {0} - env: - ENV_FILE: environment.yml - COMMENT: ${{github.event.comment.body}} - - concurrency: - # Set concurrency to prevent abuse(full runs are ~5.5 hours !!!) 
- # each user can only run one concurrent benchmark bot at a time - # We don't cancel in progress jobs, but if you want to benchmark multiple PRs, you're gonna have - # to wait - group: ${{ github.actor }}-asv - cancel-in-progress: false - - steps: - - name: Checkout - uses: actions/checkout@v5 - with: - fetch-depth: 0 - - # Although asv sets up its own env, deps are still needed - # during discovery process - - name: Set up Conda - uses: ./.github/actions/setup-conda - - - name: Run benchmarks - id: bench - continue-on-error: true # asv will exit code 1 for regressions - run: | - # extracting the regex, see https://stackoverflow.com/a/36798723 - REGEX=$(echo "$COMMENT" | sed -n "s/^.*-b\s*\(\S*\).*$/\1/p") - cd asv_bench - asv check -E existing - git remote add upstream https://github.com/pandas-dev/pandas.git - git fetch upstream - asv machine --yes - asv continuous -f 1.1 -b $REGEX upstream/main HEAD - echo 'BENCH_OUTPUT<> $GITHUB_ENV - asv compare -f 1.1 upstream/main HEAD >> $GITHUB_ENV - echo 'EOF' >> $GITHUB_ENV - echo "REGEX=$REGEX" >> $GITHUB_ENV - - - uses: actions/github-script@v8 - env: - BENCH_OUTPUT: ${{env.BENCH_OUTPUT}} - REGEX: ${{env.REGEX}} - with: - script: | - const ENV_VARS = process.env - const run_url = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}` - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: '\nBenchmarks completed. View runner logs here.' + run_url + '\nRegex used: '+ 'regex ' + ENV_VARS["REGEX"] + '\n' + ENV_VARS["BENCH_OUTPUT"] - }) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 51901919e4adb..f708308b7ec9f 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -182,7 +182,7 @@ jobs: strategy: matrix: # Note: Don't use macOS latest since macos 14 appears to be arm64 only - os: [macos-13, macos-14, windows-latest] + os: [macos-13, macos-14, windows-2025] env_file: [actions-311.yaml, actions-312.yaml, actions-313.yaml] fail-fast: false runs-on: ${{ matrix.os }} @@ -322,7 +322,7 @@ jobs: fail-fast: false matrix: # Separate out macOS 13 and 14, since macOS 14 is arm64 only - os: [ubuntu-24.04, macOS-13, macOS-14, windows-latest] + os: [ubuntu-24.04, macOS-13, macOS-14, windows-2025] timeout-minutes: 90 diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 6f9bb2c487cf8..7da88d634adfc 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -229,7 +229,7 @@ jobs: - build_sdist - build_wheels - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 environment: name: pypi @@ -243,6 +243,8 @@ jobs: with: path: dist # everything lands in ./dist/** + # TODO: This step can be probably be achieved by actions/download-artifact@v5 + # by specifying merge-multiple: true, and a glob pattern - name: Collect files run: | mkdir -p upload diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 007f8476af370..dd3fe8b916a57 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.11 + rev: v0.13.3 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -46,7 +46,7 @@ repos: - id: codespell types_or: [python, rst, markdown, cython, c] - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.16.7 + rev: v0.17.0 hooks: - id: cython-lint - id: double-quote-cython-strings @@ -67,7 +67,7 @@ repos: - 
id: trailing-whitespace args: [--markdown-linebreak-ext=md] - repo: https://github.com/PyCQA/isort - rev: 6.0.1 + rev: 6.1.0 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade @@ -92,14 +92,14 @@ repos: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v21.1.0 + rev: v21.1.2 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include args: [-i] types_or: [c, c++] - repo: https://github.com/trim21/pre-commit-mirror-meson - rev: v1.9.0 + rev: v1.9.1 hooks: - id: meson-fmt args: ['--inplace'] diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 66178a88e3e31..a49afce8861cc 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -36,16 +36,14 @@ and `good first issue `_ are typically good for newer contributors. -Once you've found an interesting issue, it's a good idea to assign the issue to yourself, -so nobody else duplicates the work on it. On the Github issue, a comment with the exact -text ``take`` to automatically assign you the issue -(this will take seconds and may require refreshing the page to see it). +Once you've found an interesting issue, leave a comment with your intention +to start working on it. If somebody else has +already commented on the issue but has shown a lack of activity on the issue +or a pull request in the past 2-3 weeks, you may take it over. If for whatever reason you are not able to continue working with the issue, please -unassign it, so other people know it's available again. You can check the list of -assigned issues, since people may not be working in them anymore. If you want to work on one -that is assigned, feel free to kindly ask the current assignee if you can take it -(please allow at least a week of inactivity before considering work in the issue discontinued). +leave a comment on the issue, so other people know it's available again. You can check the list of +assigned issues, since people may not be working on them anymore. We have several :ref:`contributor community ` communication channels, which you are welcome to join, and ask questions as you figure things out. Among them are regular meetings for diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index dc0590f18751a..d317b920e7372 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -270,6 +270,42 @@ column with another DataFrame's index. indexed_df2 = df2.set_index("key") pd.merge(df1, indexed_df2, left_on="key", right_index=True) +:meth:`~pandas.merge` also supports joining on multiple columns by passing a list of column names. + +.. code-block:: sql + + SELECT * + FROM df1_multi + INNER JOIN df2_multi + ON df1_multi.key1 = df2_multi.key1 + AND df1_multi.key2 = df2_multi.key2; + +.. ipython:: python + + df1_multi = pd.DataFrame({ + "key1": ["A", "B", "C", "D"], + "key2": [1, 2, 3, 4], + "value": np.random.randn(4) + }) + df2_multi = pd.DataFrame({ + "key1": ["B", "D", "D", "E"], + "key2": [2, 4, 4, 5], + "value": np.random.randn(4) + }) + pd.merge(df1_multi, df2_multi, on=["key1", "key2"]) + +If the columns have different names between DataFrames, ``on`` can be replaced with ``left_on`` and +``right_on``. + +..
ipython:: python + + df2_multi = pd.DataFrame({ + "key_1": ["B", "D", "D", "E"], + "key_2": [2, 4, 4, 5], + "value": np.random.randn(4) + }) + pd.merge(df1_multi, df2_multi, left_on=["key1", "key2"], right_on=["key_1", "key_2"]) + LEFT OUTER JOIN ~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8c8a16af6bd34..448ceffdaa1eb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -948,6 +948,7 @@ Datetimelike - Bug in :class:`Timestamp` constructor failing to raise when given a ``np.datetime64`` object with non-standard unit (:issue:`25611`) - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`) +- Bug in :func:`to_datetime` where passing an ``lxml.etree._ElementUnicodeResult`` together with ``format`` raised ``TypeError``. Now subclasses of ``str`` are handled. (:issue:`60933`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) - Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`) - Bug in :meth:`DataFrame.fillna` raising an ``AssertionError`` instead of ``OutOfBoundsDatetime`` when filling a ``datetime64[ns]`` column with an out-of-bounds timestamp. Now correctly raises ``OutOfBoundsDatetime``. (:issue:`61208`) @@ -972,6 +973,7 @@ Datetimelike - Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) + Timedelta ^^^^^^^^^ - Accuracy improvement in :meth:`Timedelta.to_pytimedelta` to round microseconds consistently for large nanosecond based Timedelta (:issue:`57841`) @@ -1008,8 +1010,8 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`Series.str.zfill` raising ``AttributeError`` for :class:`ArrowDtype` (:issue:`61485`) - Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`) -- Interval ^^^^^^^^ @@ -1079,6 +1081,8 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) +- Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. Now reads them as python integers. (:issue:`51295`) +- Bug in :meth:`read_csv` with ``engine="c"`` reading large float numbers with preceding integers as strings. Now reads them as floats. (:issue:`51295`) - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. 
(:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`) diff --git a/pandas/_config/config.py b/pandas/_config/config.py index f42e69a786d9f..a78a50138d195 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -271,7 +271,7 @@ def set_option(*args) -> None: if not nargs or nargs % 2 != 0: raise ValueError("Must provide an even number of non-keyword arguments") - for k, v in zip(args[::2], args[1::2]): + for k, v in zip(args[::2], args[1::2], strict=True): key = _get_single_key(k) opt = _get_registered_option(key) @@ -502,7 +502,7 @@ def option_context(*args) -> Generator[None]: "option_context(pat, val, pat, val...)." ) - ops = tuple(zip(args[::2], args[1::2])) + ops = tuple(zip(args[::2], args[1::2], strict=True)) try: undo = tuple((pat, get_option(pat)) for pat, val in ops) for pat, val in ops: diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 91eddc3261164..442891949dfd2 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -29,6 +29,7 @@ from cpython.exc cimport ( PyErr_Fetch, PyErr_Occurred, ) +from cpython.long cimport PyLong_FromString from cpython.object cimport PyObject from cpython.ref cimport ( Py_INCREF, @@ -1069,6 +1070,10 @@ cdef class TextReader: else: col_res = None for dt in self.dtype_cast_order: + if (dt.kind in "iu" and + self._column_has_float(i, start, end, na_filter, na_hashset)): + continue + try: col_res, na_count = self._convert_with_dtype( dt, i, start, end, na_filter, 0, na_hashset, na_fset) @@ -1081,9 +1086,13 @@ cdef class TextReader: np.dtype("object"), i, start, end, 0, 0, na_hashset, na_fset) except OverflowError: - col_res, na_count = self._convert_with_dtype( - np.dtype("object"), i, start, end, na_filter, - 0, na_hashset, na_fset) + try: + col_res, na_count = _try_pylong(self.parser, i, start, + end, na_filter, na_hashset) + except ValueError: + col_res, na_count = self._convert_with_dtype( + np.dtype("object"), i, start, end, 0, + 0, na_hashset, na_fset) if col_res is not None: break @@ -1342,6 +1351,58 @@ cdef class TextReader: else: return None + cdef bint _column_has_float(self, Py_ssize_t col, + int64_t start, int64_t end, + bint na_filter, kh_str_starts_t *na_hashset): + """Check if the column contains any float number.""" + cdef: + Py_ssize_t i, j, lines = end - start + coliter_t it + const char *word = NULL + const char *ignored_chars = " +-" + const char *digits = "0123456789" + const char *float_indicating_chars = "eE" + char null_byte = 0 + + coliter_setup(&it, self.parser, col, start) + + for i in range(lines): + COLITER_NEXT(it, word) + + if na_filter and kh_get_str_starts_item(na_hashset, word): + continue + + found_first_digit = False + j = 0 + while word[j] != null_byte: + if word[j] == self.parser.decimal: + return True + elif not found_first_digit and word[j] in ignored_chars: + # no-op + pass + elif not found_first_digit and word[j] not in digits: + # word isn't numeric + return False + elif not found_first_digit and word[j] in digits: + found_first_digit = True + elif word[j] in float_indicating_chars: + # preceding chars indicates numeric and + # current char indicates float + return True + elif word[j] not in digits: + # previous characters indicates numeric + # current character shows otherwise + return False + elif word[j] in digits: + # no-op + pass + else: + raise AssertionError( + f"Unhandled case {word[j]=} {found_first_digit=}" + ) + j += 1 + + return False # Factor out code common to 
TextReader.__dealloc__ and TextReader.close # It cannot be a class method, since calling self.close() in __dealloc__ @@ -1873,6 +1934,36 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col, return 0 +cdef _try_pylong(parser_t *parser, Py_ssize_t col, + int64_t line_start, int64_t line_end, + bint na_filter, kh_str_starts_t *na_hashset): + cdef: + int na_count = 0 + Py_ssize_t lines + coliter_t it + const char *word = NULL + ndarray[object] result + object NA = na_values[np.object_] + + lines = line_end - line_start + result = np.empty(lines, dtype=object) + coliter_setup(&it, parser, col, line_start) + + for i in range(lines): + COLITER_NEXT(it, word) + if na_filter and kh_get_str_starts_item(na_hashset, word): + # in the hash table + na_count += 1 + result[i] = NA + continue + + py_int = PyLong_FromString(word, NULL, 10) + if py_int is None: + raise ValueError("Invalid integer ", word) + result[i] = py_int + + return result, na_count + # -> tuple[ndarray[bool], int] cdef _try_bool_flex(parser_t *parser, int64_t col, diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index dfc1fd0fe5630..a44d819c7899a 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -5188,6 +5188,27 @@ INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" _offset_map = {} +deprec_to_valid_alias = { + "H": "h", + "BH": "bh", + "CBH": "cbh", + "T": "min", + "S": "s", + "L": "ms", + "U": "us", + "N": "ns", +} + + +def raise_invalid_freq(freq: str, extra_message: str | None = None) -> None: + msg = f"Invalid frequency: {freq}." + if extra_message is not None: + msg += f" {extra_message}" + if freq in deprec_to_valid_alias: + msg += f" Did you mean {deprec_to_valid_alias[freq]}?" + raise ValueError(msg) + + def _warn_about_deprecated_aliases(name: str, is_period: bool) -> str: if name in _lite_rule_alias: return name @@ -5236,7 +5257,7 @@ def _validate_to_offset_alias(alias: str, is_period: bool) -> None: if (alias.upper() != alias and alias.lower() not in {"s", "ms", "us", "ns"} and alias.upper().split("-")[0].endswith(("S", "E"))): - raise ValueError(INVALID_FREQ_ERR_MSG.format(alias)) + raise ValueError(raise_invalid_freq(freq=alias)) if ( is_period and alias in c_OFFSET_TO_PERIOD_FREQSTR and @@ -5267,8 +5288,9 @@ def _get_offset(name: str) -> BaseOffset: offset = klass._from_name(*split[1:]) except (ValueError, TypeError, KeyError) as err: # bad prefix or suffix - raise ValueError(INVALID_FREQ_ERR_MSG.format( - f"{name}, failed to parse with error message: {repr(err)}") + raise_invalid_freq( + freq=name, + extra_message=f"Failed to parse with error message: {repr(err)}." ) # cache _offset_map[name] = offset @@ -5399,9 +5421,10 @@ cpdef to_offset(freq, bint is_period=False): else: result = result + offset except (ValueError, TypeError) as err: - raise ValueError(INVALID_FREQ_ERR_MSG.format( - f"{freq}, failed to parse with error message: {repr(err)}") - ) from err + raise_invalid_freq( + freq=freq, + extra_message=f"Failed to parse with error message: {repr(err)}" + ) # TODO(3.0?) 
once deprecation of "d" is enforced, the check for it here # can be removed @@ -5417,7 +5440,7 @@ cpdef to_offset(freq, bint is_period=False): result = None if result is None: - raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) + raise_invalid_freq(freq=freq) try: has_period_dtype_code = hasattr(result, "_period_dtype_code") diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index b443aa7bede22..025cd6c04cb69 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -405,6 +405,11 @@ def array_strptime( if len(val) == 0 or val in nat_strings: iresult[i] = NPY_NAT continue + elif type(val) is not str: + # GH#60933: normalize string subclasses + # (e.g. lxml.etree._ElementUnicodeResult). The downstream Cython + # path expects an exact `str`, so ensure we pass a plain str + val = str(val) elif checknull_with_nat_and_na(val): iresult[i] = NPY_NAT continue diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index c13b0c4cd78a5..f5efd6922d6d7 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -2068,6 +2068,9 @@ class Timedelta(_Timedelta): disallow_ambiguous_unit(unit) + cdef: + int64_t new_value + # GH 30543 if pd.Timedelta already passed, return it # check that only value is passed if isinstance(value, _Timedelta): diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b8dd44a58e8ec..d4f0e4681405b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -569,6 +569,7 @@ def _box_pa_array( ------- pa.Array or pa.ChunkedArray """ + value = extract_array(value, extract_numpy=True) if isinstance(value, cls): pa_array = value._pa_array elif isinstance(value, (pa.Array, pa.ChunkedArray)): @@ -887,7 +888,7 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray: boxed = self._box_pa(other) except pa.lib.ArrowInvalid: # e.g. 
GH#60228 [1, "b"] we have to operate pointwise - res_values = [op(x, y) for x, y in zip(self, other)] + res_values = [op(x, y) for x, y in zip(self, other, strict=True)] result = pa.array(res_values, type=pa.bool_(), from_pandas=True) else: rtype = boxed.type @@ -1051,7 +1052,7 @@ def _arith_method(self, other, op) -> Self | npt.NDArray[np.object_]: result = self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS) if is_nan_na() and result.dtype.kind == "f": parr = result._pa_array - mask = pc.is_nan(parr).to_numpy() + mask = pc.is_nan(parr).fill_null(False).to_numpy() arr = pc.replace_with_mask(parr, mask, pa.scalar(None, type=parr.type)) result = type(self)(arr) return result @@ -2712,7 +2713,7 @@ def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): if expand: return { col: self._from_pyarrow_array(pc.struct_field(result, [i])) - for col, i in zip(groups, range(result.type.num_fields)) + for col, i in zip(groups, range(result.type.num_fields), strict=True) } else: return type(self)(pc.struct_field(result, [0])) @@ -2807,6 +2808,13 @@ def _str_wrap(self, width: int, **kwargs) -> Self: result = self._apply_elementwise(predicate) return self._from_pyarrow_array(pa.chunked_array(result)) + def _str_zfill(self, width: int) -> Self: + # TODO: Replace with pc.utf8_zfill when supported by arrow + # Arrow ENH - https://github.com/apache/arrow/issues/46683 + predicate = lambda val: val.zfill(width) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + @property def _dt_days(self) -> Self: return self._from_pyarrow_array( diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index fcd7611b3e6b5..e8ca51ef92a94 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2869,7 +2869,7 @@ def convert_values(param): # If the operator is not defined for the underlying objects, # a TypeError should be raised - res = [op(a, b) for (a, b) in zip(lvalues, rvalues)] + res = [op(a, b) for (a, b) in zip(lvalues, rvalues, strict=True)] def _maybe_convert(arr): if coerce_to_dtype: @@ -2885,7 +2885,7 @@ def _maybe_convert(arr): return res if op.__name__ in {"divmod", "rdivmod"}: - a, b = zip(*res) + a, b = zip(*res, strict=True) return _maybe_convert(a), _maybe_convert(b) return _maybe_convert(res) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6b998fbcfc1a0..d59499ed99c75 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2,6 +2,7 @@ from csv import QUOTE_NONNUMERIC from functools import partial +import itertools import operator from shutil import get_terminal_size from typing import ( @@ -1585,6 +1586,22 @@ def map( >>> cat.map({"a": "first", "b": "second"}, na_action=None) Index(['first', 'second', nan], dtype='str') + + The mapping function is applied to categories, not to each value. It is + therefore only called once per unique category, and the result reused for + all occurrences: + + >>> cat = pd.Categorical(["a", "a", "b"]) + >>> calls = [] + >>> def f(x): + ... calls.append(x) + ... 
return x.upper() + >>> result = cat.map(f) + >>> result + ['A', 'A', 'B'] + Categories (2, str): ['A', 'B'] + >>> calls + ['a', 'b'] """ assert callable(mapper) or is_dict_like(mapper) @@ -2413,8 +2430,8 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]: ensure_platform_int(self.codes), categories.size ) counts = ensure_int64(counts).cumsum() - _result = (r[start:end] for start, end in zip(counts, counts[1:])) - return dict(zip(categories, _result)) + _result = (r[start:end] for start, end in itertools.pairwise(counts)) + return dict(zip(categories, _result, strict=True)) # ------------------------------------------------------------------ # Reductions @@ -3149,5 +3166,8 @@ def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]: # For consistency, it should return two empty lists. return [], [] - codes, categories = zip(*(factorize_from_iterable(it) for it in iterables)) + codes, categories = zip( + *(factorize_from_iterable(it) for it in iterables), + strict=True, + ) return list(codes), list(categories) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c68b329b00968..7f5661f224348 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2374,7 +2374,7 @@ def _concat_same_type( to_concat = [x for x in to_concat if len(x)] if obj.freq is not None and all(x.freq == obj.freq for x in to_concat): - pairs = zip(to_concat[:-1], to_concat[1:]) + pairs = zip(to_concat[:-1], to_concat[1:], strict=True) if all(pair[0][-1] + obj.freq == pair[1][0] for pair in pairs): new_freq = obj.freq new_obj._freq = new_freq diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 8d13e76c57e4f..ace868bda52d3 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -39,7 +39,6 @@ ) from pandas.compat.numpy import function as nv from pandas.errors import IntCastingNaNError -from pandas.util._decorators import Appender from pandas.core.dtypes.cast import ( LossySetitemError, @@ -79,7 +78,6 @@ from pandas.core.arrays import ArrowExtensionArray from pandas.core.arrays.base import ( ExtensionArray, - _extension_array_shared_docs, ) from pandas.core.arrays.datetimes import DatetimeArray from pandas.core.arrays.timedeltas import TimedeltaArray @@ -178,21 +176,63 @@ """ -@Appender( - _interval_shared_docs["class"] - % { - "klass": "IntervalArray", - "summary": "Pandas array for interval data that are closed on the same side.", - "name": "", - "extra_attributes": "", - "extra_methods": "", - "examples": textwrap.dedent( - """\ +class IntervalArray(IntervalMixin, ExtensionArray): + """ + Pandas array for interval data that are closed on the same side. + + Parameters + ---------- + data : array-like (1-dimensional) + Array-like (ndarray, :class:`DateTimeArray`, :class:`TimeDeltaArray`) containing + Interval objects from which to build the IntervalArray. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both or + neither. + dtype : dtype or None, default None + If None, dtype will be inferred. + copy : bool, default False + Copy the input data. + verify_integrity : bool, default True + Verify that the IntervalArray is valid. 
+ + Attributes + ---------- + left + right + closed + mid + length + is_empty + is_non_overlapping_monotonic + + Methods + ------- + from_arrays + from_tuples + from_breaks + contains + overlaps + set_closed + to_tuples + + See Also + -------- + Index : The base pandas Index type. + Interval : A bounded slice-like interval; the elements of an IntervalArray. + interval_range : Function to create a fixed frequency IntervalIndex. + cut : Bin values into discrete Intervals. + qcut : Bin values into equal-sized Intervals based on rank or sample quantiles. + + Notes + ----- + See the `user guide + `__ + for more. + Examples -------- A new ``IntervalArray`` can be constructed directly from an array-like of ``Interval`` objects: - >>> pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) [(0, 1], (1, 5]] @@ -202,10 +242,7 @@ methods: :meth:`IntervalArray.from_arrays`, :meth:`IntervalArray.from_breaks`, and :meth:`IntervalArray.from_tuples`. """ - ), - } -) -class IntervalArray(IntervalMixin, ExtensionArray): + can_hold_na = True _na_value = _fill_value = np.nan @@ -429,23 +466,6 @@ def _from_factorized(cls, values: np.ndarray, original: IntervalArray) -> Self: ) @classmethod - @Appender( - _interval_shared_docs["from_breaks"] - % { - "klass": "IntervalArray", - "name": "", - "examples": textwrap.dedent( - """\ - Examples - -------- - >>> pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3]) - - [(0, 1], (1, 2], (2, 3]] - Length: 3, dtype: interval[int64, right] - """ - ), - } - ) def from_breaks( cls, breaks, @@ -453,6 +473,39 @@ def from_breaks( copy: bool = False, dtype: Dtype | None = None, ) -> Self: + """ + Construct an IntervalArray from an array of splits. + + Parameters + ---------- + breaks : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : bool, default False + Copy the data. + dtype : dtype or None, default None + If None, dtype will be inferred. + + Returns + ------- + IntervalArray + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + IntervalArray.from_arrays : Construct from a left and right array. + IntervalArray.from_tuples : Construct from a sequence of tuples. + + Examples + -------- + >>> pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3]) + + [(0, 1], (1, 2], (2, 3]] + Length: 3, dtype: interval[int64, right] + """ + breaks = _maybe_convert_platform_interval(breaks) return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, dtype=dtype) @@ -508,23 +561,6 @@ def from_breaks( ) @classmethod - @Appender( - _interval_shared_docs["from_arrays"] - % { - "klass": "IntervalArray", - "name": "", - "examples": textwrap.dedent( - """\ - Examples - -------- - >>> pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3]) - - [(0, 1], (1, 2], (2, 3]] - Length: 3, dtype: interval[int64, right] - """ - ), - } - ) def from_arrays( cls, left, @@ -533,6 +569,57 @@ def from_arrays( copy: bool = False, dtype: Dtype | None = None, ) -> Self: + """ + Construct from two arrays defining the left and right bounds. + + Parameters + ---------- + left : array-like (1-dimensional) + Left bounds for each interval. + right : array-like (1-dimensional) + Right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : bool, default False + Copy the data. 
+ dtype : dtype, optional + If None, dtype will be inferred. + + Returns + ------- + IntervalArray + + Raises + ------ + ValueError + When a value is missing in only one of `left` or `right`. + When a value in `left` is greater than the corresponding value + in `right`. + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + IntervalArray.from_breaks : Construct an IntervalArray from an array of + splits. + IntervalArray.from_tuples : Construct an IntervalArray from an + array-like of tuples. + + Notes + ----- + Each element of `left` must be less than or equal to the `right` + element at the same position. If an element is missing, it must be + missing in both `left` and `right`. A TypeError is raised when + using an unsupported type for `left` or `right`. At the moment, + 'category', 'object', and 'string' subtypes are not supported. + + Examples + -------- + >>> pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3]) + + [(0, 1], (1, 2], (2, 3]] + Length: 3, dtype: interval[int64, right] + """ left = _maybe_convert_platform_interval(left) right = _maybe_convert_platform_interval(right) @@ -581,23 +668,6 @@ def from_arrays( ) @classmethod - @Appender( - _interval_shared_docs["from_tuples"] - % { - "klass": "IntervalArray", - "name": "", - "examples": textwrap.dedent( - """\ - Examples - -------- - >>> pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)]) - - [(0, 1], (1, 2]] - Length: 2, dtype: interval[int64, right] - """ - ), - } - ) def from_tuples( cls, data, @@ -605,6 +675,40 @@ def from_tuples( copy: bool = False, dtype: Dtype | None = None, ) -> Self: + """ + Construct an IntervalArray from an array-like of tuples. + + Parameters + ---------- + data : array-like (1-dimensional) + Array of tuples. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : bool, default False + By-default copy the data, this is compat only and ignored. + dtype : dtype or None, default None + If None, dtype will be inferred. + + Returns + ------- + IntervalArray + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + IntervalArray.from_arrays : Construct an IntervalArray from a left and + right array. + IntervalArray.from_breaks : Construct an IntervalArray from an array of + splits. + + Examples + -------- + >>> pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)]) + + [(0, 1], (1, 2]] + Length: 2, dtype: interval[int64, right] + """ if len(data): left, right = [], [] else: @@ -1381,23 +1485,50 @@ def mid(self) -> Index: """ ) - @Appender( - _interval_shared_docs["overlaps"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( - """\ + def overlaps(self, other): + """ + Check elementwise if an Interval overlaps the values in the IntervalArray. + + Two intervals overlap if they share a common point, including closed + endpoints. Intervals that only have an open endpoint in common do not + overlap. + + Parameters + ---------- + other : IntervalArray + Interval to check against for an overlap. + + Returns + ------- + ndarray + Boolean array positionally indicating where an overlap occurs. + + See Also + -------- + Interval.overlaps : Check whether two Interval objects overlap. 
+ + Examples + -------- >>> data = [(0, 1), (1, 3), (2, 4)] >>> intervals = pd.arrays.IntervalArray.from_tuples(data) >>> intervals [(0, 1], (1, 3], (2, 4]] Length: 3, dtype: interval[int64, right] + + >>> intervals.overlaps(pd.Interval(0.5, 1.5)) + array([ True, True, False]) + + Intervals that share closed endpoints overlap: + + >>> intervals.overlaps(pd.Interval(1, 3, closed="left")) + array([ True, True, True]) + + Intervals that only have an open endpoint in common do not overlap: + + >>> intervals.overlaps(pd.Interval(1, 2, closed="right")) + array([False, True, False]) """ - ), - } - ) - def overlaps(self, other): if isinstance(other, (IntervalArray, ABCIntervalIndex)): raise NotImplementedError if not isinstance(other, Interval): @@ -1762,7 +1893,7 @@ def to_tuples(self, na_tuple: bool = True) -> np.ndarray: >>> idx.to_tuples() Index([(0, 1), (1, 2)], dtype='object') """ - tuples = com.asarray_tuplesafe(zip(self._left, self._right)) + tuples = com.asarray_tuplesafe(zip(self._left, self._right, strict=True)) if not na_tuple: # GH 18756 tuples = np.where(~self.isna(), tuples, np.nan) @@ -1817,12 +1948,52 @@ def delete(self, loc) -> Self: new_right = self._right.delete(loc) return self._shallow_copy(left=new_left, right=new_right) - @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) def repeat( self, repeats: int | Sequence[int], axis: AxisInt | None = None, ) -> Self: + """ + Repeat elements of a IntervalArray. + + Returns a new IntervalArray where each element of the current IntervalArray + is repeated consecutively a given number of times. + + Parameters + ---------- + repeats : int or array of ints + The number of repetitions for each element. This should be a + non-negative integer. Repeating 0 times will return an empty + IntervalArray. + axis : None + Must be ``None``. Has no effect but is accepted for compatibility + with numpy. + + Returns + ------- + IntervalArray + Newly created IntervalArray with repeated elements. + + See Also + -------- + Series.repeat : Equivalent function for Series. + Index.repeat : Equivalent function for Index. + numpy.repeat : Similar method for :class:`numpy.ndarray`. + ExtensionArray.take : Take arbitrary positions. 
+ + Examples + -------- + >>> cat = pd.Categorical(["a", "b", "c"]) + >>> cat + ['a', 'b', 'c'] + Categories (3, str): ['a', 'b', 'c'] + >>> cat.repeat(2) + ['a', 'a', 'b', 'b', 'c', 'c'] + Categories (3, str): ['a', 'b', 'c'] + >>> cat.repeat([1, 2, 3]) + ['a', 'b', 'b', 'c', 'c', 'c'] + Categories (3, str): ['a', 'b', 'c'] + """ nv.validate_repeat((), {"axis": axis}) left_repeat = self.left.repeat(repeats) right_repeat = self.right.repeat(repeats) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d20dc87259a37..57efde1a928bc 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -344,7 +344,7 @@ def __iter__(self) -> Iterator: yield val else: na_value = self.dtype.na_value - for isna_, val in zip(self._mask, self._data): + for isna_, val in zip(self._mask, self._data, strict=True): if isna_: yield na_value else: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 180080da4cd00..adc09afab485b 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -1445,7 +1445,7 @@ def _range_from_fields( freqstr = freq.freqstr year, quarter = _make_field_arrays(year, quarter) - for y, q in zip(year, quarter): + for y, q in zip(year, quarter, strict=True): calendar_year, calendar_month = parsing.quarter_to_myear(y, q, freqstr) val = libperiod.period_ordinal( calendar_year, calendar_month, 1, 1, 1, 1, 0, 0, base @@ -1455,7 +1455,7 @@ def _range_from_fields( freq = to_offset(freq, is_period=True) base = libperiod.freq_to_dtype_code(freq) arrays = _make_field_arrays(year, month, day, hour, minute, second) - for y, mth, d, h, mn, s in zip(*arrays): + for y, mth, d, h, mn, s in zip(*arrays, strict=True): ordinals.append(libperiod.period_ordinal(y, mth, d, h, mn, s, 0, 0, base)) return np.array(ordinals, dtype=np.int64), freq diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c04f3716f4739..e6ff67af78700 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1753,7 +1753,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): self._simple_new( sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv) ) - for sp_value, fv in zip(sp_values, fill_value) + for sp_value, fv in zip(sp_values, fill_value, strict=True) ) return arrays elif method == "reduce": diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index b38eaa4072796..95abd9a953e24 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -172,7 +172,7 @@ def __init__( warnings.warn( "The 'pyarrow_numpy' storage option name is deprecated and will be " 'removed in pandas 3.0. 
Use \'pd.StringDtype(storage="pyarrow", ' - "na_value-np.nan)' to construct the same dtype.\nOr enable the " + "na_value=np.nan)' to construct the same dtype.\nOr enable the " "'pd.options.future.infer_string = True' option globally and use " 'the "str" alias as a shorthand notation to specify a dtype ' '(instead of "string[pyarrow_numpy]").', @@ -1134,3 +1134,6 @@ def _cmp_method(self, other, op): return res_arr _arith_method = _cmp_method + + def _str_zfill(self, width: int) -> Self: + return self._str_map(lambda x: x.zfill(width)) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 64c2e1779aba7..1942212cd97b8 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -621,7 +621,9 @@ def __truediv__(self, other): if is_object_dtype(other.dtype): other = np.asarray(other) if self.ndim > 1: - res_cols = [left / right for left, right in zip(self, other)] + res_cols = [ + left / right for left, right in zip(self, other, strict=True) + ] res_cols2 = [x.reshape(1, -1) for x in res_cols] result = np.concatenate(res_cols2, axis=0) else: @@ -670,7 +672,9 @@ def __floordiv__(self, other): elif is_object_dtype(other.dtype): other = np.asarray(other) if self.ndim > 1: - res_cols = [left // right for left, right in zip(self, other)] + res_cols = [ + left // right for left, right in zip(self, other, strict=True) + ] res_cols2 = [x.reshape(1, -1) for x in res_cols] result = np.concatenate(res_cols2, axis=0) else: diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 408c2858aa876..1dd4ed7100ccf 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -514,7 +514,7 @@ def _array_equivalent_object( left_remaining = left right_remaining = right - for left_value, right_value in zip(left_remaining, right_remaining): + for left_value, right_value in zip(left_remaining, right_remaining, strict=True): if left_value is NaT and right_value is not NaT: return False diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 72c9fe51be7f3..643974db5f2bf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1472,8 +1472,57 @@ def style(self) -> Styler: Name: population, dtype: int64 """ - @Appender(_shared_docs["items"]) def items(self) -> Iterable[tuple[Hashable, Series]]: + r""" + Iterate over (column name, Series) pairs. + + Iterates over the DataFrame columns, returning a tuple with + the column name and the content as a Series. + + Yields + ------ + label : object + The column names for the DataFrame being iterated over. + content : Series + The column entries belonging to each label, as a Series. + + See Also + -------- + DataFrame.iterrows : Iterate over DataFrame rows as + (index, Series) pairs. + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples + of the values. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "species": ["bear", "bear", "marsupial"], + ... "population": [1864, 22000, 80000], + ... }, + ... index=["panda", "polar", "koala"], + ... ) + >>> df + species population + panda bear 1864 + polar bear 22000 + koala marsupial 80000 + >>> for label, content in df.items(): + ... print(f"label: {label}") + ... 
print(f"content: {content}", sep="\n") + label: species + content: + panda bear + polar bear + koala marsupial + Name: species, dtype: object + label: population + content: + panda 1864 + polar 22000 + koala 80000 + Name: population, dtype: int64 + """ for i, k in enumerate(self.columns): yield k, self._ixs(i, axis=1) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 66629966a8254..fe7bf5bbc4c2c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4325,7 +4325,7 @@ def nth(self) -> GroupByNthSelector: def _nth( self, n: PositionalIndexer | tuple, - dropna: Literal["any", "all", None] = None, + dropna: Literal["any", "all"] | None = None, ) -> NDFrameT: if not dropna: mask = self._make_mask_from_positional_indexer(n) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index c658f625d5ea9..dc9cf9bcd1e64 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -296,7 +296,7 @@ def __init__(self, groupby_object: groupby.GroupBy) -> None: def __call__( self, n: PositionalIndexer | tuple, - dropna: Literal["any", "all", None] = None, + dropna: Literal["any", "all"] | None = None, ) -> DataFrame | Series: return self.groupby_object._nth(n, dropna) diff --git a/pandas/core/indexers/utils.py b/pandas/core/indexers/utils.py index b089be3469d87..cfc49b8083267 100644 --- a/pandas/core/indexers/utils.py +++ b/pandas/core/indexers/utils.py @@ -434,10 +434,11 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: ---------- array : array-like The array that is being indexed (only used for the length). - indexer : array-like or list-like - The array-like that's used to index. List-like input that is not yet - a numpy array or an ExtensionArray is converted to one. Other input - types are passed through as is. + indexer : array-like, list-like, int, slice, or other indexer + The indexer used for indexing. Array-like and list-like inputs that + are not yet a numpy array or an ExtensionArray are converted to one. + Non-array indexers (int, slice, Ellipsis, tuples, etc.) are passed + through as is. Returns ------- @@ -486,6 +487,13 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: >>> pd.api.indexers.check_array_indexer(arr, mask) array([ True, False]) + Integer and slice indexers are passed through as is: + + >>> pd.api.indexers.check_array_indexer(arr, 1) + 1 + >>> pd.api.indexers.check_array_indexer(arr, slice(0, 1, 1)) + slice(0, 1, 1) + Similarly for integer indexers, an integer ndarray is returned when it is a valid indexer, otherwise an error is (for integer indexers, a matching length is not required): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7d18c85c98bbe..7054c2e23e1ed 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -40,6 +40,7 @@ IgnoreRaise, IndexLabel, IndexT, + NaPosition, Scalar, Shape, npt, @@ -2398,8 +2399,54 @@ def append(self, other): return Index(new_tuples) def argsort( - self, *args, na_position: str = "last", **kwargs + self, *args, na_position: NaPosition = "last", **kwargs ) -> npt.NDArray[np.intp]: + """ + Return the integer indices that would sort the index. + + Parameters + ---------- + *args + Passed to `numpy.ndarray.argsort`. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + **kwargs + Passed to `numpy.ndarray.argsort`. 
+ + Returns + ------- + np.ndarray[np.intp] + Integer indices that would sort the index if used as + an indexer. + + See Also + -------- + numpy.argsort : Similar method for NumPy arrays. + Index.argsort : Similar method for Index. + + Examples + -------- + >>> midx = pd.MultiIndex.from_arrays([[3, 2], ["e", "c"]]) + >>> midx + MultiIndex([(3, 'e'), (2, 'c')]) + + >>> order = midx.argsort() + >>> order + array([1, 0]) + + >>> midx[order] + MultiIndex([(2, 'c'), + (3, 'e')], + ) + + >>> midx = pd.MultiIndex.from_arrays([[2, 2], [np.nan, 0]]) + >>> midx.argsort(na_position="first") + array([0, 1]) + + >>> midx.argsort() + array([1, 0]) + """ target = self._sort_levels_monotonic(raise_if_incomparable=True) keys = [lev.codes for lev in target._get_codes_for_sorting()] return lexsort_indexer(keys, na_position=na_position, codes_given=True) diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index bbeaa98178b68..d92540af168c0 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -199,6 +199,9 @@ def _select_data(self) -> DataFrame: include=self.include, exclude=self.exclude, ) + if len(data.columns) == 0: + msg = "No columns match the specified include or exclude data types" + raise ValueError(msg) return data diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index aea95e4684573..75003675dc173 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -198,7 +198,9 @@ def to_dict( if i in object_dtype_indices_as_set else list(map(maybe_box_native, v.to_numpy())), ) - for i, (box_na_value, (k, v)) in enumerate(zip(box_na_values, df.items())) + for i, (box_na_value, (k, v)) in enumerate( + zip(box_na_values, df.items(), strict=True) + ) ) elif orient == "split": @@ -235,12 +237,13 @@ def to_dict( columns = df.columns.tolist() if are_all_object_dtype_cols: return [ - into_c(zip(columns, map(maybe_box_native, row))) + into_c(zip(columns, map(maybe_box_native, row), strict=True)) for row in df.itertuples(index=False, name=None) ] else: data = [ - into_c(zip(columns, t)) for t in df.itertuples(index=False, name=None) + into_c(zip(columns, t, strict=True)) + for t in df.itertuples(index=False, name=None) ] if box_native_indices: object_dtype_indices_as_set = set(box_native_indices) @@ -260,7 +263,7 @@ def to_dict( columns = df.columns.tolist() if are_all_object_dtype_cols: return into_c( - (t[0], dict(zip(df.columns, map(maybe_box_native, t[1:])))) + (t[0], dict(zip(df.columns, map(maybe_box_native, t[1:]), strict=True))) for t in df.itertuples(name=None) ) elif box_native_indices: @@ -272,14 +275,17 @@ def to_dict( column: maybe_box_native(v) if i in object_dtype_indices_as_set else v - for i, (column, v) in enumerate(zip(columns, t[1:])) + for i, (column, v) in enumerate( + zip(columns, t[1:], strict=True) + ) }, ) for t in df.itertuples(name=None) ) else: return into_c( - (t[0], dict(zip(columns, t[1:]))) for t in df.itertuples(name=None) + (t[0], dict(zip(columns, t[1:], strict=True))) + for t in df.itertuples(name=None) ) else: diff --git a/pandas/core/series.py b/pandas/core/series.py index 3a7b0614c63ec..a5c3bb8d51e8a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4419,6 +4419,34 @@ def map( 2 NaN 3 I am a rabbit dtype: object + + For categorical data, the function is only applied to the categories: + + >>> s = pd.Series(list("cabaa")) + >>> s.map(print) + c + a + b + a + a + 0 None + 1 None + 2 None + 3 None + 4 None + dtype: object + + >>> s_cat = 
s.astype("category") + >>> s_cat.map(print) # function called once per unique category + a + b + c + 0 None + 1 None + 2 None + 3 None + 4 None + dtype: object """ if func is None: if "arg" in kwargs: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index b78ea3a9bf883..ff3a17e4d2d5b 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -334,7 +334,7 @@ def _wrap_result( ) result = { label: ArrowExtensionArray(pa.array(res)) - for label, res in zip(name, result.T) + for label, res in zip(name, result.T, strict=True) } elif is_object_dtype(result): @@ -684,7 +684,8 @@ def cat( elif na_rep is not None and union_mask.any(): # fill NaNs with na_rep in case there are actually any NaNs all_cols = [ - np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) + np.where(nm, na_rep, col) + for nm, col in zip(na_masks, all_cols, strict=True) ] result = cat_safe(all_cols, sep) else: @@ -1912,8 +1913,8 @@ def zfill(self, width: int): if not is_integer(width): msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - f = lambda x: x.zfill(width) - result = self._data.array._str_map(f) + + result = self._data.array._str_zfill(width) return self._wrap_result(result) def slice(self, start=None, stop=None, step=None): diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index ba35542b7f112..2afa4eb8efb76 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -544,3 +544,6 @@ def f(x): return empty_row return [f(val) for val in np.asarray(self)] + + def _str_zfill(self, width: int): + return self._str_map(lambda x: x.zfill(width)) diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 2ed241f0b9bca..9a562481f0e98 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -7,6 +7,7 @@ import warnings from pandas._libs import lib +from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend @@ -21,6 +22,7 @@ from pandas._typing import DtypeBackend +@set_module("pandas") def read_clipboard( sep: str = r"\s+", dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 84f5cc447513c..48028c54a1773 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -39,6 +39,7 @@ from pandas.util._decorators import ( Appender, doc, + set_module, ) from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend @@ -434,6 +435,7 @@ def read_excel( ) -> dict[IntStrT, DataFrame]: ... 
+@set_module("pandas") @doc(storage_options=_shared_docs["storage_options"]) @Appender(_read_excel_doc) def read_excel( @@ -951,6 +953,7 @@ def _parse_sheet( return output +@set_module("pandas") @doc(storage_options=_shared_docs["storage_options"]) class ExcelWriter(Generic[_WorkbookT]): """ @@ -1471,6 +1474,7 @@ def inspect_excel_format( return "zip" +@set_module("pandas") @doc(storage_options=_shared_docs["storage_options"]) class ExcelFile: """ diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index ebb678c26db30..19f1e41f5b22f 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -13,7 +13,10 @@ from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import Pandas4Warning -from pandas.util._decorators import doc +from pandas.util._decorators import ( + doc, + set_module, +) from pandas.util._validators import check_dtype_backend from pandas.core.api import DataFrame @@ -68,6 +71,7 @@ def to_feather( feather.write_feather(df, handles.handle, **kwargs) +@set_module("pandas") @doc(storage_options=_shared_docs["storage_options"]) def read_feather( path: FilePath | ReadBuffer[bytes], diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 7de26249762bb..d72b6cd89b940 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -42,6 +42,7 @@ Timestamp, ) from pandas._libs.tslibs.nattype import NaTType +from pandas.util._decorators import set_module from pandas.core.dtypes.common import ( is_complex_dtype, @@ -1955,6 +1956,7 @@ def __call__(self, num: float) -> str: return formatted +@set_module("pandas") def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> None: """ Format float representation in DataFrame with SI notation. 
diff --git a/pandas/io/html.py b/pandas/io/html.py index 183af3a03221b..70f29fd4e876d 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -24,7 +24,10 @@ AbstractMethodError, EmptyDataError, ) -from pandas.util._decorators import doc +from pandas.util._decorators import ( + doc, + set_module, +) from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like @@ -220,7 +223,7 @@ def __init__( attrs: dict[str, str] | None, encoding: str, displayed_only: bool, - extract_links: Literal[None, "header", "footer", "body", "all"], + extract_links: Literal["header", "footer", "body", "all"] | None, storage_options: StorageOptions = None, ) -> None: self.io = io @@ -1024,6 +1027,7 @@ def _parse( return ret +@set_module("pandas") @doc(storage_options=_shared_docs["storage_options"]) def read_html( io: FilePath | ReadBuffer[str], @@ -1042,7 +1046,7 @@ def read_html( na_values: Iterable[object] | None = None, keep_default_na: bool = True, displayed_only: bool = True, - extract_links: Literal[None, "header", "footer", "body", "all"] = None, + extract_links: Literal["header", "footer", "body", "all"] | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, storage_options: StorageOptions = None, ) -> list[DataFrame]: diff --git a/pandas/io/iceberg.py b/pandas/io/iceberg.py index dcb675271031e..19300384ff845 100644 --- a/pandas/io/iceberg.py +++ b/pandas/io/iceberg.py @@ -3,10 +3,12 @@ ) from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import set_module from pandas import DataFrame +@set_module("pandas") def read_iceberg( table_identifier: str, catalog_name: str | None = None, diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 32e932b70e761..481f6a3a0aa61 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -29,7 +29,10 @@ from pandas._libs.tslibs import iNaT from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError -from pandas.util._decorators import doc +from pandas.util._decorators import ( + doc, + set_module, +) from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import ( @@ -496,6 +499,7 @@ def read_json( ) -> DataFrame: ... 
+@set_module("pandas") @doc( storage_options=_shared_docs["storage_options"], decompression_options=_shared_docs["decompression_options"] % "path_or_buf", diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 680f4289fdbff..16ec73ddeb743 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -17,6 +17,7 @@ import numpy as np from pandas._libs.writers import convert_json_to_lines +from pandas.util._decorators import set_module import pandas as pd from pandas import ( @@ -266,6 +267,7 @@ def _simple_json_normalize( return normalized_json_object +@set_module("pandas") def json_normalize( data: dict | list[dict] | Series, record_path: str | list | None = None, diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 02e0ec5247e74..84195df91d49c 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -11,6 +11,7 @@ from pandas._libs import lib from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import set_module from pandas.util._validators import check_dtype_backend from pandas.core.indexes.api import default_index @@ -35,6 +36,7 @@ from pandas.core.frame import DataFrame +@set_module("pandas") def read_orc( path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 1a3f8cc046066..09539089b3904 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -21,7 +21,10 @@ AbstractMethodError, Pandas4Warning, ) -from pandas.util._decorators import doc +from pandas.util._decorators import ( + doc, + set_module, +) from pandas.util._validators import check_dtype_backend from pandas import ( @@ -500,6 +503,7 @@ def to_parquet( return None +@set_module("pandas") @doc(storage_options=_shared_docs["storage_options"]) def read_parquet( path: FilePath | ReadBuffer[bytes], diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index f0441f583bea2..19afa26d88c77 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -10,7 +10,10 @@ import warnings from pandas.compat import pickle_compat -from pandas.util._decorators import doc +from pandas.util._decorators import ( + doc, + set_module, +) from pandas.core.shared_docs import _shared_docs @@ -31,6 +34,7 @@ ) +@set_module("pandas") @doc( storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "filepath_or_buffer", @@ -114,6 +118,7 @@ def to_pickle( pickle.dump(obj, handles.handle, protocol=protocol) +@set_module("pandas") @doc( storage_options=_shared_docs["storage_options"], decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer", diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 97e31114ead48..3616a93321358 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -51,7 +51,10 @@ PerformanceWarning, PossibleDataLossError, ) -from pandas.util._decorators import cache_readonly +from pandas.util._decorators import ( + cache_readonly, + set_module, +) from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( @@ -312,6 +315,7 @@ def to_hdf( f(store) +@set_module("pandas") def read_hdf( path_or_buf: FilePath | HDFStore, key=None, @@ -488,6 +492,7 @@ def _is_metadata_of(group: Node, parent_group: Node) -> bool: return False +@set_module("pandas") class HDFStore: """ Dict-like IO interface for storing pandas objects in PyTables. 
diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 1424d43d2a053..46b22310cbfca 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -15,7 +15,10 @@ overload, ) -from pandas.util._decorators import doc +from pandas.util._decorators import ( + doc, + set_module, +) from pandas.core.shared_docs import _shared_docs @@ -83,6 +86,7 @@ def read_sas( ) -> DataFrame | SASReader: ... +@set_module("pandas") @doc(decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer") def read_sas( filepath_or_buffer: FilePath | ReadBuffer[bytes], diff --git a/pandas/io/spss.py b/pandas/io/spss.py index dfada10c719c9..522c7206a2ae5 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -7,6 +7,7 @@ from pandas._libs import lib from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import set_module from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.inference import is_list_like @@ -22,6 +23,7 @@ from pandas import DataFrame +@set_module("pandas") def read_spss( path: str | Path, usecols: Sequence[str] | None = None, diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 81adeab0e1907..7a8ba2e146bcf 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -40,6 +40,7 @@ AbstractMethodError, DatabaseError, ) +from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend @@ -263,6 +264,7 @@ def read_sql_table( ) -> Iterator[DataFrame]: ... +@set_module("pandas") def read_sql_table( table_name: str, con, @@ -394,6 +396,7 @@ def read_sql_query( ) -> Iterator[DataFrame]: ... +@set_module("pandas") def read_sql_query( sql, con, @@ -532,6 +535,7 @@ def read_sql( ) -> Iterator[DataFrame]: ... +@set_module("pandas") def read_sql( sql, con, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 979f00973958b..69205d6bebb65 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -46,6 +46,7 @@ from pandas.util._decorators import ( Appender, doc, + set_module, ) from pandas.util._exceptions import find_stack_level @@ -2133,6 +2134,7 @@ def value_labels(self) -> dict[str, dict[int, str]]: return self._value_label_dict +@set_module("pandas") @Appender(_read_stata_doc) def read_stata( filepath_or_buffer: FilePath | ReadBuffer[bytes], diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 97259c57bbf33..53c0b07f3d90f 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -17,7 +17,10 @@ AbstractMethodError, ParserError, ) -from pandas.util._decorators import doc +from pandas.util._decorators import ( + doc, + set_module, +) from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like @@ -829,6 +832,7 @@ def _parse( ) +@set_module("pandas") @doc( storage_options=_shared_docs["storage_options"], decompression_options=_shared_docs["decompression_options"] % "path_or_buffer", diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index c91903f7bd43e..b46af93c447d4 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -8,11 +8,6 @@ from pandas._config import get_option -from pandas.util._decorators import ( - Appender, - Substitution, -) - from pandas.core.dtypes.common import ( is_integer, is_list_like, @@ -276,235 +271,177 @@ def hist_frame( ) -_boxplot_doc = """ -Make a box plot from DataFrame columns. - -Make a box-and-whisker plot from DataFrame columns, optionally grouped -by some other columns. 
A box plot is a method for graphically depicting -groups of numerical data through their quartiles. -The box extends from the Q1 to Q3 quartile values of the data, -with a line at the median (Q2). The whiskers extend from the edges -of box to show the range of the data. By default, they extend no more than -`1.5 * IQR (IQR = Q3 - Q1)` from the edges of the box, ending at the farthest -data point within that interval. Outliers are plotted as separate dots. - -For further details see -Wikipedia's entry for `boxplot `_. - -Parameters ----------- -%(data)s\ -column : str or list of str, optional - Column name or list of names, or vector. - Can be any valid input to :meth:`pandas.DataFrame.groupby`. -by : str or array-like, optional - Column in the DataFrame to :meth:`pandas.DataFrame.groupby`. - One box-plot will be done per value of columns in `by`. -ax : object of class matplotlib.axes.Axes, optional - The matplotlib axes to be used by boxplot. -fontsize : float or str - Tick label font size in points or as a string (e.g., `large`). -rot : float, default 0 - The rotation angle of labels (in degrees) - with respect to the screen coordinate system. -grid : bool, default True - Setting this to True will show the grid. -figsize : A tuple (width, height) in inches - The size of the figure to create in matplotlib. -layout : tuple (rows, columns), optional - For example, (3, 5) will display the subplots - using 3 rows and 5 columns, starting from the top-left. -return_type : {'axes', 'dict', 'both'} or None, default 'axes' - The kind of object to return. The default is ``axes``. - - * 'axes' returns the matplotlib axes the boxplot is drawn on. - * 'dict' returns a dictionary whose values are the matplotlib - Lines of the boxplot. - * 'both' returns a namedtuple with the axes and dict. - * when grouping with ``by``, a Series mapping columns to - ``return_type`` is returned. - - If ``return_type`` is `None`, a NumPy array - of axes with the same shape as ``layout`` is returned. -%(backend)s\ - -**kwargs - All other plotting keyword arguments to be passed to - :func:`matplotlib.pyplot.boxplot`. - -Returns -------- -result - See Notes. - -See Also --------- -Series.plot.hist: Make a histogram. -matplotlib.pyplot.boxplot : Matplotlib equivalent plot. - -Notes ------ -The return type depends on the `return_type` parameter: - -* 'axes' : object of class matplotlib.axes.Axes -* 'dict' : dict of matplotlib.lines.Line2D objects -* 'both' : a namedtuple with structure (ax, lines) - -For data grouped with ``by``, return a Series of the above or a numpy -array: - -* :class:`~pandas.Series` -* :class:`~numpy.array` (for ``return_type = None``) - -Use ``return_type='dict'`` when you want to tweak the appearance -of the lines after plotting. In this case a dict containing the Lines -making up the boxes, caps, fliers, medians, and whiskers is returned. - -Examples --------- - -Boxplots can be created for every column in the dataframe -by ``df.boxplot()`` or indicating the columns to be used: - -.. plot:: - :context: close-figs - - >>> np.random.seed(1234) - >>> df = pd.DataFrame(np.random.randn(10, 4), - ... columns=['Col1', 'Col2', 'Col3', 'Col4']) - >>> boxplot = df.boxplot(column=['Col1', 'Col2', 'Col3']) # doctest: +SKIP - -Boxplots of variables distributions grouped by the values of a third -variable can be created using the option ``by``. For instance: - -.. plot:: - :context: close-figs - - >>> df = pd.DataFrame(np.random.randn(10, 2), - ... 
columns=['Col1', 'Col2']) - >>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', - ... 'B', 'B', 'B', 'B', 'B']) - >>> boxplot = df.boxplot(by='X') - -A list of strings (i.e. ``['X', 'Y']``) can be passed to boxplot -in order to group the data by combination of the variables in the x-axis: - -.. plot:: - :context: close-figs - - >>> df = pd.DataFrame(np.random.randn(10, 3), - ... columns=['Col1', 'Col2', 'Col3']) - >>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', - ... 'B', 'B', 'B', 'B', 'B']) - >>> df['Y'] = pd.Series(['A', 'B', 'A', 'B', 'A', - ... 'B', 'A', 'B', 'A', 'B']) - >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by=['X', 'Y']) - -The layout of boxplot can be adjusted giving a tuple to ``layout``: - -.. plot:: - :context: close-figs - - >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', - ... layout=(2, 1)) - -Additional formatting can be done to the boxplot, like suppressing the grid -(``grid=False``), rotating the labels in the x-axis (i.e. ``rot=45``) -or changing the fontsize (i.e. ``fontsize=15``): +def boxplot( + data: DataFrame, + column: str | list[str] | None = None, + by: str | list[str] | None = None, + ax: Axes | None = None, + fontsize: float | str | None = None, + rot: int = 0, + grid: bool = True, + figsize: tuple[float, float] | None = None, + layout: tuple[int, int] | None = None, + return_type: str | None = None, + **kwargs, +): + """ + Make a box plot from DataFrame columns. -.. plot:: - :context: close-figs + Make a box-and-whisker plot from DataFrame columns, optionally grouped + by some other columns. A box plot is a method for graphically depicting + groups of numerical data through their quartiles. + The box extends from the Q1 to Q3 quartile values of the data, + with a line at the median (Q2). The whiskers extend from the edges + of box to show the range of the data. By default, they extend no more than + `1.5 * IQR (IQR = Q3 - Q1)` from the edges of the box, ending at the farthest + data point within that interval. Outliers are plotted as separate dots. - >>> boxplot = df.boxplot(grid=False, rot=45, fontsize=15) # doctest: +SKIP + For further details see + Wikipedia's entry for `boxplot `_. -The parameter ``return_type`` can be used to select the type of element -returned by `boxplot`. When ``return_type='axes'`` is selected, -the matplotlib axes on which the boxplot is drawn are returned: + Parameters + ---------- + data : DataFrame + The data to visualize. + column : str or list of str, optional + Column name or list of names, or vector. + Can be any valid input to :meth:`pandas.DataFrame.groupby`. + by : str or array-like, optional + Column in the DataFrame to :meth:`pandas.DataFrame.groupby`. + One box-plot will be done per value of columns in `by`. + ax : object of class matplotlib.axes.Axes, optional + The matplotlib axes to be used by boxplot. + fontsize : float or str + Tick label font size in points or as a string (e.g., `large`). + rot : float, default 0 + The rotation angle of labels (in degrees) + with respect to the screen coordinate system. + grid : bool, default True + Setting this to True will show the grid. + figsize : A tuple (width, height) in inches + The size of the figure to create in matplotlib. + layout : tuple (rows, columns), optional + For example, (3, 5) will display the subplots + using 3 rows and 5 columns, starting from the top-left. + return_type : {'axes', 'dict', 'both'} or None, default 'axes' + The kind of object to return. The default is ``axes``. 
+ + * 'axes' returns the matplotlib axes the boxplot is drawn on. + * 'dict' returns a dictionary whose values are the matplotlib + lines of the boxplot. + * 'both' returns a namedtuple with the axes and dict. + * when grouping with ``by``, a Series mapping columns to + ``return_type`` is returned. + + If ``return_type`` is `None`, a NumPy array + of axes with the same shape as ``layout`` is returned. - >>> boxplot = df.boxplot(column=['Col1', 'Col2'], return_type='axes') - >>> type(boxplot) - + **kwargs + All other plotting keyword arguments to be passed to + :func:`matplotlib.pyplot.boxplot`. -When grouping with ``by``, a Series mapping columns to ``return_type`` -is returned: + Returns + ------- + result + See Notes. - >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', - ... return_type='axes') - >>> type(boxplot) - + See Also + -------- + Series.plot.hist: Make a histogram. + matplotlib.pyplot.boxplot : Matplotlib equivalent plot. -If ``return_type`` is `None`, a NumPy array of axes with the same shape -as ``layout`` is returned: + Notes + ----- + The return type depends on the `return_type` parameter: - >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', - ... return_type=None) - >>> type(boxplot) - -""" + * 'axes' : object of class matplotlib.axes.Axes + * 'dict' : dict of matplotlib.lines.Line2D objects + * 'both' : a namedtuple with structure (ax, lines) -_backend_doc = """\ -backend : str, default None - Backend to use instead of the backend specified in the option - ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to - specify the ``plotting.backend`` for the whole session, set - ``pd.options.plotting.backend``. -""" + For data grouped with ``by``, return a Series of the above or a numpy + array: + * :class:`~pandas.Series` + * :class:`~numpy.array` (for ``return_type = None``) -_bar_or_line_doc = """ - Parameters - ---------- - x : label or position, optional - Allows plotting of one column versus another. If not specified, - the index of the DataFrame is used. - y : label or position, optional - Allows plotting of one column versus another. If not specified, - all numerical columns are used. - color : str, array-like, or dict, optional - The color for each of the DataFrame's columns. Possible values are: + Use ``return_type='dict'`` when you want to tweak the appearance + of the lines after plotting. In this case a dict containing the Lines + making up the boxes, caps, fliers, medians, and whiskers is returned. - - A single color string referred to by name, RGB or RGBA code, - for instance 'red' or '#a98d19'. + Examples + -------- - - A sequence of color strings referred to by name, RGB or RGBA - code, which will be used for each column recursively. For - instance ['green','yellow'] each column's %(kind)s will be filled in - green or yellow, alternatively. If there is only a single column to - be plotted, then only the first color from the color list will be - used. + Boxplots can be created for every column in the dataframe + by ``df.boxplot()`` or indicating the columns to be used: - - A dict of the form {column name : color}, so that each column will be - colored accordingly. For example, if your columns are called `a` and - `b`, then passing {'a': 'green', 'b': 'red'} will color %(kind)ss for - column `a` in green and %(kind)ss for column `b` in red. + .. plot:: + :context: close-figs - **kwargs - Additional keyword arguments are documented in - :meth:`DataFrame.plot`. + >>> np.random.seed(1234) + >>> df = pd.DataFrame( + ... 
np.random.randn(10, 4), columns=["Col1", "Col2", "Col3", "Col4"] + ... ) + >>> boxplot = df.boxplot(column=["Col1", "Col2", "Col3"]) # doctest: +SKIP - Returns - ------- - matplotlib.axes.Axes or np.ndarray of them - An ndarray is returned with one :class:`matplotlib.axes.Axes` - per column when ``subplots=True``. -""" + Boxplots of variables distributions grouped by the values of a third + variable can be created using the option ``by``. For instance: + .. plot:: + :context: close-figs -@Substitution(data="data : DataFrame\n The data to visualize.\n", backend="") -@Appender(_boxplot_doc) -def boxplot( - data: DataFrame, - column: str | list[str] | None = None, - by: str | list[str] | None = None, - ax: Axes | None = None, - fontsize: float | str | None = None, - rot: int = 0, - grid: bool = True, - figsize: tuple[float, float] | None = None, - layout: tuple[int, int] | None = None, - return_type: str | None = None, - **kwargs, -): + >>> df = pd.DataFrame(np.random.randn(10, 2), columns=["Col1", "Col2"]) + >>> df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) + >>> boxplot = df.boxplot(by="X") + + A list of strings (i.e. ``['X', 'Y']``) can be passed to boxplot + in order to group the data by combination of the variables in the x-axis: + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame(np.random.randn(10, 3), columns=["Col1", "Col2", "Col3"]) + >>> df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) + >>> df["Y"] = pd.Series(["A", "B", "A", "B", "A", "B", "A", "B", "A", "B"]) + >>> boxplot = df.boxplot(column=["Col1", "Col2"], by=["X", "Y"]) + + The layout of boxplot can be adjusted giving a tuple to ``layout``: + + .. plot:: + :context: close-figs + + >>> boxplot = df.boxplot(column=["Col1", "Col2"], by="X", layout=(2, 1)) + + Additional formatting can be done to the boxplot, like suppressing the grid + (``grid=False``), rotating the labels in the x-axis (i.e. ``rot=45``) + or changing the fontsize (i.e. ``fontsize=15``): + + .. plot:: + :context: close-figs + + >>> boxplot = df.boxplot(grid=False, rot=45, fontsize=15) # doctest: +SKIP + + The parameter ``return_type`` can be used to select the type of element + returned by `boxplot`. When ``return_type='axes'`` is selected, + the matplotlib axes on which the boxplot is drawn are returned: + + >>> boxplot = df.boxplot(column=["Col1", "Col2"], return_type="axes") + >>> type(boxplot) + + + When grouping with ``by``, a Series mapping columns to ``return_type`` + is returned: + + >>> boxplot = df.boxplot(column=["Col1", "Col2"], by="X", return_type="axes") + >>> type(boxplot) + + + If ``return_type`` is `None`, a NumPy array of axes with the same shape + as ``layout`` is returned: + + >>> boxplot = df.boxplot(column=["Col1", "Col2"], by="X", return_type=None) + >>> type(boxplot) + + """ plot_backend = _get_plot_backend("matplotlib") return plot_backend.boxplot( data, @@ -521,8 +458,6 @@ def boxplot( ) -@Substitution(data="", backend=_backend_doc) -@Appender(_boxplot_doc) def boxplot_frame( self: DataFrame, column=None, @@ -537,6 +472,168 @@ def boxplot_frame( backend=None, **kwargs, ): + """ + Make a box plot from DataFrame columns. + + Make a box-and-whisker plot from DataFrame columns, optionally grouped + by some other columns. A box plot is a method for graphically depicting + groups of numerical data through their quartiles. + The box extends from the Q1 to Q3 quartile values of the data, + with a line at the median (Q2). 
The whiskers extend from the edges + of box to show the range of the data. By default, they extend no more than + `1.5 * IQR (IQR = Q3 - Q1)` from the edges of the box, ending at the farthest + data point within that interval. Outliers are plotted as separate dots. + + For further details see + Wikipedia's entry for `boxplot `_. + + Parameters + ---------- + column : str or list of str, optional + Column name or list of names, or vector. + Can be any valid input to :meth:`pandas.DataFrame.groupby`. + by : str or array-like, optional + Column in the DataFrame to :meth:`pandas.DataFrame.groupby`. + One box-plot will be done per value of columns in `by`. + ax : object of class matplotlib.axes.Axes, optional + The matplotlib axes to be used by boxplot. + fontsize : float or str + Tick label font size in points or as a string (e.g., `large`). + rot : float, default 0 + The rotation angle of labels (in degrees) + with respect to the screen coordinate system. + grid : bool, default True + Setting this to True will show the grid. + figsize : A tuple (width, height) in inches + The size of the figure to create in matplotlib. + layout : tuple (rows, columns), optional + For example, (3, 5) will display the subplots + using 3 rows and 5 columns, starting from the top-left. + return_type : {'axes', 'dict', 'both'} or None, default 'axes' + The kind of object to return. The default is ``axes``. + + * 'axes' returns the matplotlib axes the boxplot is drawn on. + * 'dict' returns a dictionary whose values are the matplotlib + lines of the boxplot. + * 'both' returns a namedtuple with the axes and dict. + * when grouping with ``by``, a Series mapping columns to + ``return_type`` is returned. + + If ``return_type`` is `None`, a NumPy array + of axes with the same shape as ``layout`` is returned. + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + + **kwargs + All other plotting keyword arguments to be passed to + :func:`matplotlib.pyplot.boxplot`. + + Returns + ------- + result + See Notes. + + See Also + -------- + Series.plot.hist: Make a histogram. + matplotlib.pyplot.boxplot : Matplotlib equivalent plot. + + Notes + ----- + The return type depends on the `return_type` parameter: + + * 'axes' : object of class matplotlib.axes.Axes + * 'dict' : dict of matplotlib.lines.Line2D objects + * 'both' : a namedtuple with structure (ax, lines) + + For data grouped with ``by``, return a Series of the above or a numpy + array: + + * :class:`~pandas.Series` + * :class:`~numpy.array` (for ``return_type = None``) + + Use ``return_type='dict'`` when you want to tweak the appearance + of the lines after plotting. In this case a dict containing the Lines + making up the boxes, caps, fliers, medians, and whiskers is returned. + + Examples + -------- + + Boxplots can be created for every column in the dataframe + by ``df.boxplot()`` or indicating the columns to be used: + + .. plot:: + :context: close-figs + + >>> np.random.seed(1234) + >>> df = pd.DataFrame( + ... np.random.randn(10, 4), columns=["Col1", "Col2", "Col3", "Col4"] + ... ) + >>> boxplot = df.boxplot(column=["Col1", "Col2", "Col3"]) # doctest: +SKIP + + Boxplots of variables distributions grouped by the values of a third + variable can be created using the option ``by``. For instance: + + .. 
plot:: + :context: close-figs + + >>> df = pd.DataFrame(np.random.randn(10, 2), columns=["Col1", "Col2"]) + >>> df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) + >>> boxplot = df.boxplot(by="X") + + A list of strings (i.e. ``['X', 'Y']``) can be passed to boxplot + in order to group the data by combination of the variables in the x-axis: + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame(np.random.randn(10, 3), columns=["Col1", "Col2", "Col3"]) + >>> df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) + >>> df["Y"] = pd.Series(["A", "B", "A", "B", "A", "B", "A", "B", "A", "B"]) + >>> boxplot = df.boxplot(column=["Col1", "Col2"], by=["X", "Y"]) + + The layout of boxplot can be adjusted giving a tuple to ``layout``: + + .. plot:: + :context: close-figs + + >>> boxplot = df.boxplot(column=["Col1", "Col2"], by="X", layout=(2, 1)) + + Additional formatting can be done to the boxplot, like suppressing the grid + (``grid=False``), rotating the labels in the x-axis (i.e. ``rot=45``) + or changing the fontsize (i.e. ``fontsize=15``): + + .. plot:: + :context: close-figs + + >>> boxplot = df.boxplot(grid=False, rot=45, fontsize=15) # doctest: +SKIP + + The parameter ``return_type`` can be used to select the type of element + returned by `boxplot`. When ``return_type='axes'`` is selected, + the matplotlib axes on which the boxplot is drawn are returned: + + >>> boxplot = df.boxplot(column=["Col1", "Col2"], return_type="axes") + >>> type(boxplot) + + + When grouping with ``by``, a Series mapping columns to ``return_type`` + is returned: + + >>> boxplot = df.boxplot(column=["Col1", "Col2"], by="X", return_type="axes") + >>> type(boxplot) + + + If ``return_type`` is `None`, a NumPy array of axes with the same shape + as ``layout`` is returned: + + >>> boxplot = df.boxplot(column=["Col1", "Col2"], by="X", return_type=None) + >>> type(boxplot) + + """ + plot_backend = _get_plot_backend(backend) return plot_backend.boxplot_frame( self, @@ -1074,8 +1171,55 @@ def __call__(self, *args, **kwargs): __call__.__doc__ = __doc__ - @Appender( + def line( + self, + x: Hashable | None = None, + y: Hashable | None = None, + color: str | Sequence[str] | dict | None = None, + **kwargs, + ) -> PlotAccessor: """ + Plot Series or DataFrame as lines. + + This function is useful to plot lines using DataFrame's values + as coordinates. + + Parameters + ---------- + x : label or position, optional + Allows plotting of one column versus another. If not specified, + the index of the DataFrame is used. + y : label or position, optional + Allows plotting of one column versus another. If not specified, + all numerical columns are used. + color : str, array-like, or dict, optional + The color for each of the DataFrame's columns. Possible values are: + + - A single color string referred to by name, RGB or RGBA code, + for instance 'red' or '#a98d19'. + + - A sequence of color strings referred to by name, RGB or RGBA + code, which will be used for each column recursively. For + instance ['green','yellow'] each column's line will be filled in + green or yellow, alternatively. If there is only a single column to + be plotted, then only the first color from the color list will be + used. + + - A dict of the form {column name : color}, so that each column will be + colored accordingly. For example, if your columns are called `a` and + `b`, then passing {'a': 'green', 'b': 'red'} will color lines for + column `a` in green and lines for column `b` in red. 
+ + **kwargs + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + matplotlib.axes.Axes or np.ndarray of them + An ndarray is returned with one :class:`matplotlib.axes.Axes` + per column when ``subplots=True``. + See Also -------- matplotlib.pyplot.plot : Plot y versus x as lines and/or markers. @@ -1095,30 +1239,33 @@ def __call__(self, *args, **kwargs): The following example shows the populations for some animals over the years. - >>> df = pd.DataFrame({ - ... 'pig': [20, 18, 489, 675, 1776], - ... 'horse': [4, 25, 281, 600, 1900] - ... }, index=[1990, 1997, 2003, 2009, 2014]) + >>> df = pd.DataFrame( + ... { + ... "pig": [20, 18, 489, 675, 1776], + ... "horse": [4, 25, 281, 600, 1900], + ... }, + ... index=[1990, 1997, 2003, 2009, 2014], + ... ) >>> lines = df.plot.line() .. plot:: - :context: close-figs + :context: close-figs - An example with subplots, so an array of axes is returned. + An example with subplots, so an array of axes is returned. - >>> axes = df.plot.line(subplots=True) - >>> type(axes) - + >>> axes = df.plot.line(subplots=True) + >>> type(axes) + .. plot:: - :context: close-figs + :context: close-figs - Let's repeat the same example, but specifying colors for - each column (in this case, for each animal). + Let's repeat the same example, but specifying colors for + each column (in this case, for each animal). - >>> axes = df.plot.line( - ... subplots=True, color={"pig": "pink", "horse": "#742802"} - ... ) + >>> axes = df.plot.line( + ... subplots=True, color={"pig": "pink", "horse": "#742802"} + ... ) .. plot:: :context: close-figs @@ -1126,12 +1273,13 @@ def __call__(self, *args, **kwargs): The following example shows the relationship between both populations. - >>> lines = df.plot.line(x='pig', y='horse') + >>> lines = df.plot.line(x="pig", y="horse") """ - ) - @Substitution(kind="line") - @Appender(_bar_or_line_doc) - def line( + if color is not None: + kwargs["color"] = color + return self(kind="line", x=x, y=y, **kwargs) + + def bar( self, x: Hashable | None = None, y: Hashable | None = None, @@ -1139,17 +1287,50 @@ def line( **kwargs, ) -> PlotAccessor: """ - Plot Series or DataFrame as lines. + Vertical bar plot. - This function is useful to plot lines using DataFrame's values - as coordinates. - """ - if color is not None: - kwargs["color"] = color - return self(kind="line", x=x, y=y, **kwargs) + A bar plot is a plot that presents categorical data with + rectangular bars with lengths proportional to the values that they + represent. A bar plot shows comparisons among discrete categories. One + axis of the plot shows the specific categories being compared, and the + other axis represents a measured value. + + Parameters + ---------- + x : label or position, optional + Allows plotting of one column versus another. If not specified, + the index of the DataFrame is used. + y : label or position, optional + Allows plotting of one column versus another. If not specified, + all numerical columns are used. + color : str, array-like, or dict, optional + The color for each of the DataFrame's columns. Possible values are: + + - A single color string referred to by name, RGB or RGBA code, + for instance 'red' or '#a98d19'. + + - A sequence of color strings referred to by name, RGB or RGBA + code, which will be used for each column recursively. For + instance ['green','yellow'] each column's bar will be filled in + green or yellow, alternatively. 
If there is only a single column to + be plotted, then only the first color from the color list will be + used. + + - A dict of the form {column name : color}, so that each column will be + colored accordingly. For example, if your columns are called `a` and + `b`, then passing {'a': 'green', 'b': 'red'} will color bars for + column `a` in green and bars for column `b` in red. + + **kwargs + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + matplotlib.axes.Axes or np.ndarray of them + An ndarray is returned with one :class:`matplotlib.axes.Axes` + per column when ``subplots=True``. - @Appender( - """ See Also -------- DataFrame.plot.barh : Horizontal bar plot. @@ -1163,8 +1344,8 @@ def line( .. plot:: :context: close-figs - >>> df = pd.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]}) - >>> ax = df.plot.bar(x='lab', y='val', rot=0) + >>> df = pd.DataFrame({"lab": ["A", "B", "C"], "val": [10, 30, 20]}) + >>> ax = df.plot.bar(x="lab", y="val", rot=0) Plot a whole dataframe to a bar plot. Each column is assigned a distinct color, and each row is nested in a group along the @@ -1175,10 +1356,16 @@ def line( >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] - >>> index = ['snail', 'pig', 'elephant', - ... 'rabbit', 'giraffe', 'coyote', 'horse'] - >>> df = pd.DataFrame({'speed': speed, - ... 'lifespan': lifespan}, index=index) + >>> index = [ + ... "snail", + ... "pig", + ... "elephant", + ... "rabbit", + ... "giraffe", + ... "coyote", + ... "horse", + ... ] + >>> df = pd.DataFrame({"speed": speed, "lifespan": lifespan}, index=index) >>> ax = df.plot.bar(rot=0) Plot stacked bar charts for the DataFrame @@ -1205,7 +1392,9 @@ def line( :context: close-figs >>> axes = df.plot.bar( - ... rot=0, subplots=True, color={"speed": "red", "lifespan": "green"} + ... rot=0, + ... subplots=True, + ... color={"speed": "red", "lifespan": "green"}, ... ) >>> axes[1].legend(loc=2) # doctest: +SKIP @@ -1214,19 +1403,20 @@ def line( .. plot:: :context: close-figs - >>> ax = df.plot.bar(y='speed', rot=0) + >>> ax = df.plot.bar(y="speed", rot=0) Plot only selected categories for the DataFrame. .. plot:: :context: close-figs - >>> ax = df.plot.bar(x='lifespan', rot=0) - """ - ) - @Substitution(kind="bar") - @Appender(_bar_or_line_doc) - def bar( + >>> ax = df.plot.bar(x="lifespan", rot=0) + """ + if color is not None: + kwargs["color"] = color + return self(kind="bar", x=x, y=y, **kwargs) + + def barh( self, x: Hashable | None = None, y: Hashable | None = None, @@ -1234,20 +1424,50 @@ def bar( **kwargs, ) -> PlotAccessor: """ - Vertical bar plot. + Make a horizontal bar plot. - A bar plot is a plot that presents categorical data with + A horizontal bar plot is a plot that presents quantitative data with rectangular bars with lengths proportional to the values that they represent. A bar plot shows comparisons among discrete categories. One axis of the plot shows the specific categories being compared, and the other axis represents a measured value. - """ - if color is not None: - kwargs["color"] = color - return self(kind="bar", x=x, y=y, **kwargs) - @Appender( - """ + Parameters + ---------- + x : label or position, optional + Allows plotting of one column versus another. If not specified, + the index of the DataFrame is used. + y : label or position, optional + Allows plotting of one column versus another. If not specified, + all numerical columns are used. 
+ color : str, array-like, or dict, optional + The color for each of the DataFrame's columns. Possible values are: + + - A single color string referred to by name, RGB or RGBA code, + for instance 'red' or '#a98d19'. + + - A sequence of color strings referred to by name, RGB or RGBA + code, which will be used for each column recursively. For + instance ['green','yellow'] each column's bar will be filled in + green or yellow, alternatively. If there is only a single column to + be plotted, then only the first color from the color list will be + used. + + - A dict of the form {column name : color}, so that each column will be + colored accordingly. For example, if your columns are called `a` and + `b`, then passing {'a': 'green', 'b': 'red'} will color bars for + column `a` in green and bars for column `b` in red. + + **kwargs + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + matplotlib.axes.Axes or np.ndarray of them + An ndarray is returned with one :class:`matplotlib.axes.Axes` + per column when ``subplots=True``. + See Also -------- DataFrame.plot.bar : Vertical bar plot. @@ -1261,8 +1481,8 @@ def bar( .. plot:: :context: close-figs - >>> df = pd.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]}) - >>> ax = df.plot.barh(x='lab', y='val') + >>> df = pd.DataFrame({"lab": ["A", "B", "C"], "val": [10, 30, 20]}) + >>> ax = df.plot.barh(x="lab", y="val") Plot a whole DataFrame to a horizontal bar plot @@ -1271,10 +1491,16 @@ def bar( >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] - >>> index = ['snail', 'pig', 'elephant', - ... 'rabbit', 'giraffe', 'coyote', 'horse'] - >>> df = pd.DataFrame({'speed': speed, - ... 'lifespan': lifespan}, index=index) + >>> index = [ + ... "snail", + ... "pig", + ... "elephant", + ... "rabbit", + ... "giraffe", + ... "coyote", + ... "horse", + ... ] + >>> df = pd.DataFrame({"speed": speed, "lifespan": lifespan}, index=index) >>> ax = df.plot.barh() Plot stacked barh charts for the DataFrame @@ -1298,11 +1524,17 @@ def bar( >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] - >>> index = ['snail', 'pig', 'elephant', - ... 'rabbit', 'giraffe', 'coyote', 'horse'] - >>> df = pd.DataFrame({'speed': speed, - ... 'lifespan': lifespan}, index=index) - >>> ax = df.plot.barh(y='speed') + >>> index = [ + ... "snail", + ... "pig", + ... "elephant", + ... "rabbit", + ... "giraffe", + ... "coyote", + ... "horse", + ... ] + >>> df = pd.DataFrame({"speed": speed, "lifespan": lifespan}, index=index) + >>> ax = df.plot.barh(y="speed") Plot DataFrame versus the desired column @@ -1311,30 +1543,17 @@ def bar( >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] - >>> index = ['snail', 'pig', 'elephant', - ... 'rabbit', 'giraffe', 'coyote', 'horse'] - >>> df = pd.DataFrame({'speed': speed, - ... 'lifespan': lifespan}, index=index) - >>> ax = df.plot.barh(x='lifespan') - """ - ) - @Substitution(kind="bar") - @Appender(_bar_or_line_doc) - def barh( - self, - x: Hashable | None = None, - y: Hashable | None = None, - color: str | Sequence[str] | dict | None = None, - **kwargs, - ) -> PlotAccessor: - """ - Make a horizontal bar plot. - - A horizontal bar plot is a plot that presents quantitative data with - rectangular bars with lengths proportional to the values that they - represent. A bar plot shows comparisons among discrete categories. 
One - axis of the plot shows the specific categories being compared, and the - other axis represents a measured value. + >>> index = [ + ... "snail", + ... "pig", + ... "elephant", + ... "rabbit", + ... "giraffe", + ... "coyote", + ... "horse", + ... ] + >>> df = pd.DataFrame({"speed": speed, "lifespan": lifespan}, index=index) + >>> ax = df.plot.barh(x="lifespan") """ if color is not None: kwargs["color"] = color diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 849a81eaf56d9..719b655a7c860 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -538,5 +538,28 @@ def test_set_module(): assert pd.pivot.__module__ == "pandas" assert pd.cut.__module__ == "pandas" assert pd.qcut.__module__ == "pandas" + assert pd.read_clipboard.__module__ == "pandas" + assert pd.ExcelFile.__module__ == "pandas" + assert pd.ExcelWriter.__module__ == "pandas" + assert pd.read_excel.__module__ == "pandas" + assert pd.read_feather.__module__ == "pandas" + assert pd.set_eng_float_format.__module__ == "pandas" + assert pd.read_html.__module__ == "pandas" + assert pd.read_iceberg.__module__ == "pandas" + assert pd.read_json.__module__ == "pandas" + assert pd.json_normalize.__module__ == "pandas" + assert pd.read_orc.__module__ == "pandas" + assert pd.read_parquet.__module__ == "pandas" + assert pd.read_pickle.__module__ == "pandas" + assert pd.to_pickle.__module__ == "pandas" + assert pd.HDFStore.__module__ == "pandas" + assert pd.read_hdf.__module__ == "pandas" + assert pd.read_sas.__module__ == "pandas" + assert pd.read_spss.__module__ == "pandas" + assert pd.read_sql.__module__ == "pandas" + assert pd.read_sql_query.__module__ == "pandas" + assert pd.read_sql_table.__module__ == "pandas" + assert pd.read_stata.__module__ == "pandas" + assert pd.read_xml.__module__ == "pandas" assert api.typing.SeriesGroupBy.__module__ == "pandas.api.typing" assert api.typing.DataFrameGroupBy.__module__ == "pandas.api.typing" diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d2d65c4b983a7..c810f098f15cf 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3691,3 +3691,12 @@ def test_setitem_float_nan_is_na(using_nan_is_na): ser[2] = np.nan assert isinstance(ser[2], float) assert np.isnan(ser[2]) + + +def test_pow_with_all_na_float(): + # GH#62520 + + s = pd.Series([None, None], dtype="float64[pyarrow]") + result = s.pow(2) + expected = pd.Series([pd.NA, pd.NA], dtype="float64[pyarrow]") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 50656ca85e90a..90e27cbcf412b 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -371,6 +371,13 @@ def test_describe_when_include_all_exclude_not_allowed(self, exclude): with pytest.raises(ValueError, match=msg): df.describe(include="all", exclude=exclude) + def test_describe_when_included_dtypes_not_present(self): + # GH#61863 + df = DataFrame({"a": [1, 2, 3]}) + msg = "No columns match the specified include or exclude data types" + with pytest.raises(ValueError, match=msg): + df.describe(include=["datetime"]) + def test_describe_with_duplicate_columns(self): df = DataFrame( [[1, 1, 1], [2, 2, 2], [3, 3, 3]], diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 17aaf6a4b1108..ddb58ecbfa6f3 100644 --- a/pandas/tests/indexing/test_iloc.py +++ 
b/pandas/tests/indexing/test_iloc.py @@ -8,6 +8,7 @@ from pandas.compat import pa_version_under16p0 from pandas.errors import IndexingError +import pandas.util._test_decorators as td from pandas import ( NA, @@ -1535,3 +1536,15 @@ def test_iloc_arrow_extension_array(self): expected = df.iloc[:, df["c"]] result = df_arrow.iloc[:, df_arrow["c"]] tm.assert_frame_equal(result, expected, check_dtype=False) + + @td.skip_if_no("pyarrow") + def test_setitem_pyarrow_int_series(self): + # GH#62462 + ser = Series([1, 2, 3], dtype="int64[pyarrow]") + idx = Index([0, 1]) + vals = Series([7, 8], dtype="int64[pyarrow]") + + ser.iloc[idx] = vals + + expected = Series([7, 8, 3], dtype="int64[pyarrow]") + tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 75ec96409bdd0..3c9e7c80f9db0 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -295,29 +295,28 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator): tm.assert_frame_equal(result, expected) -def test_read_csv_memory_growth_chunksize(all_parsers): +def test_read_csv_memory_growth_chunksize(temp_file, all_parsers): # see gh-24805 # # Let's just make sure that we don't crash # as we iteratively process all chunks. parser = all_parsers - with tm.ensure_clean() as path: - with open(path, "w", encoding="utf-8") as f: - for i in range(1000): - f.write(str(i) + "\n") - - if parser.engine == "pyarrow": - msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with parser.read_csv(path, chunksize=20) as result: - for _ in result: - pass - return - - with parser.read_csv(path, chunksize=20) as result: - for _ in result: - pass + with open(temp_file, "w", encoding="utf-8") as f: + for i in range(1000): + f.write(str(i) + "\n") + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv(temp_file, chunksize=20) as result: + for _ in result: + pass + return + + with parser.read_csv(temp_file, chunksize=20) as result: + for _ in result: + pass def test_chunksize_with_usecols_second_block_shorter(all_parsers): diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index 598f397da686d..072294d34fb75 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py @@ -77,3 +77,30 @@ def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): expected = DataFrame({"data": [f"10E{exp}"]}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "value, expected_value", + [ + ("32.0", 32.0), + ("32e0", 32.0), + ("3.2e1", 32.0), + ("3.2e80", 3.2e80), + ("3.2e-80", 3.2e-80), + ("18446744073709551616.0", float(1 << 64)), # loses precision + ("18446744073709551616.5", float(1 << 64)), # loses precision + ("36893488147419103232.3", float(1 << 65)), # loses precision + ], +) +def test_small_int_followed_by_float( + all_parsers_all_precisions, value, expected_value, request +): + # GH#51295 + parser, precision = all_parsers_all_precisions + data = f"""data + 42 + {value}""" + result = parser.read_csv(StringIO(data), float_precision=precision) + expected = DataFrame({"data": [42.0, expected_value]}) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_ints.py 
b/pandas/tests/io/parser/common/test_ints.py index 9322e8d54f5b8..87db540f6293a 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -144,17 +144,22 @@ def test_int64_overflow(all_parsers, conv, request): if parser.engine == "pyarrow": mark = pytest.mark.xfail(reason="parses to float64") request.applymarker(mark) + elif parser.engine == "python": + mark = pytest.mark.xfail( + reason="TODO: Python engine reads bigint as string" + ) + request.applymarker(mark) result = parser.read_csv(StringIO(data)) expected = DataFrame( [ - "00013007854817840016671868", - "00013007854817840016749251", - "00013007854817840016754630", - "00013007854817840016781876", - "00013007854817840017028824", - "00013007854817840017963235", - "00013007854817840018860166", + 13007854817840016671868, + 13007854817840016749251, + 13007854817840016754630, + 13007854817840016781876, + 13007854817840017028824, + 13007854817840017963235, + 13007854817840018860166, ], columns=["ID"], ) @@ -185,7 +190,7 @@ def test_int64_overflow(all_parsers, conv, request): ) def test_int64_uint64_range(all_parsers, val): # These numbers fall right inside the int64-uint64 - # range, so they should be parsed as string. + # range, so they should be parsed as integer. parser = all_parsers result = parser.read_csv(StringIO(str(val)), header=None) @@ -197,13 +202,30 @@ def test_int64_uint64_range(all_parsers, val): @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) -def test_outside_int64_uint64_range(all_parsers, val): +def test_outside_int64_uint64_range(all_parsers, val, request): # These numbers fall just outside the int64-uint64 - # range, so they should be parsed as string. + # range, so they should be parsed as object. 
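# Behaviour the updated expectation encodes (C engine; the python engine is
# marked xfail just below): a value just outside the int64/uint64 range is
# now parsed as a Python int in an object column rather than as a string.
# Assumed illustration, mirroring the expected frame in this test:
#
#     from io import StringIO
#     import pandas as pd
#     pd.read_csv(StringIO("18446744073709551616"), header=None).iloc[0, 0]
#     # -> 18446744073709551616 (int), not '18446744073709551616'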
parser = all_parsers + if parser.engine == "python": + mark = pytest.mark.xfail(reason="TODO: Python engine reads bigint as string") + request.applymarker(mark) + result = parser.read_csv(StringIO(str(val)), header=None) - expected = DataFrame([str(val)]) + expected = DataFrame([val]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow # CSV parse error: Empty CSV file or block +@pytest.mark.parametrize( + "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] +) +def test_outside_int64_uint64_range_follow_str(all_parsers, val): + parser = all_parsers + + result = parser.read_csv(StringIO(f"{val}\nabc"), header=None) + + expected = DataFrame([str(val), "abc"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_iterator.py b/pandas/tests/io/parser/common/test_iterator.py index 668aab05b9fa4..5841fb7ad9594 100644 --- a/pandas/tests/io/parser/common/test_iterator.py +++ b/pandas/tests/io/parser/common/test_iterator.py @@ -142,19 +142,18 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs): pass -def test_iteration_open_handle(all_parsers): +def test_iteration_open_handle(temp_file, all_parsers): parser = all_parsers kwargs = {"header": None} - with tm.ensure_clean() as path: - with open(path, "w", encoding="utf-8") as f: - f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG") + with open(temp_file, "w", encoding="utf-8") as f: + f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG") - with open(path, encoding="utf-8") as f: - for line in f: - if "CCC" in line: - break + with open(temp_file, encoding="utf-8") as f: + for line in f: + if "CCC" in line: + break - result = parser.read_csv(f, **kwargs) - expected = DataFrame({0: ["DDD", "EEE", "FFF", "GGG"]}) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(f, **kwargs) + expected = DataFrame({0: ["DDD", "EEE", "FFF", "GGG"]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index f3019bd1c8187..b8cf435ef0443 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -246,7 +246,7 @@ def test_null_byte_char(request, all_parsers): @pytest.mark.filterwarnings("always::ResourceWarning") -def test_open_file(all_parsers): +def test_open_file(all_parsers, temp_file): # GH 39024 parser = all_parsers @@ -259,14 +259,13 @@ def test_open_file(all_parsers): msg = "'utf-8' codec can't decode byte 0xe4" err = ValueError - with tm.ensure_clean() as path: - file = Path(path) - file.write_bytes(b"\xe4\na\n1") + file = Path(temp_file) + file.write_bytes(b"\xe4\na\n1") - with tm.assert_produces_warning(None): - # should not trigger a ResourceWarning - with pytest.raises(err, match=msg): - parser.read_csv(file, sep=None, encoding_errors="replace") + with tm.assert_produces_warning(None): + # should not trigger a ResourceWarning + with pytest.raises(err, match=msg): + parser.read_csv(file, sep=None, encoding_errors="replace") def test_invalid_on_bad_line(all_parsers): diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index e4563afc631c5..daf8ec83bff77 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -29,7 +29,9 @@ @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @pytest.mark.usefixtures("pyarrow_xfail") -def test_dtype_all_columns(all_parsers, 
dtype, check_orig, using_infer_string): +def test_dtype_all_columns( + all_parsers, dtype, check_orig, using_infer_string, tmp_path +): # see gh-3795, gh-6607 parser = all_parsers @@ -39,20 +41,20 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string): index=["1A", "1B", "1C", "1D", "1E"], ) - with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: - df.to_csv(path) + path = tmp_path / "__passing_str_as_dtype__.csv" + df.to_csv(path) - result = parser.read_csv(path, dtype=dtype, index_col=0) + result = parser.read_csv(path, dtype=dtype, index_col=0) - if check_orig: - expected = df.copy() - result = result.astype(float) - elif using_infer_string and dtype is str: - expected = df.astype(str) - else: - expected = df.astype(str).astype(object) + if check_orig: + expected = df.copy() + result = result.astype(float) + elif using_infer_string and dtype is str: + expected = df.astype(str) + else: + expected = df.astype(str).astype(object) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) @pytest.mark.usefixtures("pyarrow_xfail") diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 469fe84a80dcd..5ef4eeb3dc927 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -130,7 +130,7 @@ def test_dtype_and_names_error(c_parser_only): ], ids=["dt64-0", "dt64-1", "td64", f"{tm.ENDIAN}U8"], ) -def test_unsupported_dtype(c_parser_only, match, kwargs): +def test_unsupported_dtype(c_parser_only, match, kwargs, tmp_path): parser = c_parser_only df = DataFrame( np.random.default_rng(2).random((5, 2)), @@ -138,11 +138,11 @@ def test_unsupported_dtype(c_parser_only, match, kwargs): index=["1A", "1B", "1C", "1D", "1E"], ) - with tm.ensure_clean("__unsupported_dtype__.csv") as path: - df.to_csv(path) + path = tmp_path / "__unsupported_dtype__.csv" + df.to_csv(path) - with pytest.raises(TypeError, match=match): - parser.read_csv(path, index_col=0, **kwargs) + with pytest.raises(TypeError, match=match): + parser.read_csv(path, index_col=0, **kwargs) @td.skip_if_32bit @@ -563,27 +563,27 @@ def test_file_handles_mmap(c_parser_only, csv1): assert not m.closed -def test_file_binary_mode(c_parser_only): +def test_file_binary_mode(c_parser_only, temp_file): # see gh-23779 parser = c_parser_only expected = DataFrame([[1, 2, 3], [4, 5, 6]]) - with tm.ensure_clean() as path: - with open(path, "w", encoding="utf-8") as f: - f.write("1,2,3\n4,5,6") + path = temp_file + with open(path, "w", encoding="utf-8") as f: + f.write("1,2,3\n4,5,6") - with open(path, "rb") as f: - result = parser.read_csv(f, header=None) - tm.assert_frame_equal(result, expected) + with open(path, "rb") as f: + result = parser.read_csv(f, header=None) + tm.assert_frame_equal(result, expected) -def test_unix_style_breaks(c_parser_only): +def test_unix_style_breaks(c_parser_only, temp_file): # GH 11020 parser = c_parser_only - with tm.ensure_clean() as path: - with open(path, "w", newline="\n", encoding="utf-8") as f: - f.write("blah\n\ncol_1,col_2,col_3\n\n") - result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c") + path = temp_file + with open(path, "w", newline="\n", encoding="utf-8") as f: + f.write("blah\n\ncol_1,col_2,col_3\n\n") + result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c") expected = DataFrame(columns=["col_1", "col_2", "col_3"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_compression.py 
b/pandas/tests/io/parser/test_compression.py index 191d0de50b12f..9db820cc1566c 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -35,59 +35,61 @@ def parser_and_data(all_parsers, csv1): @pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) -def test_zip(parser_and_data, compression): +def test_zip(tmp_path, parser_and_data, compression): parser, data, expected = parser_and_data - with tm.ensure_clean("test_file.zip") as path: - with zipfile.ZipFile(path, mode="w") as tmp: - tmp.writestr("test_file", data) + path = tmp_path / "test_file.zip" + with zipfile.ZipFile(path, mode="w") as tmp: + tmp.writestr("test_file", data) - if compression == "zip2": - with open(path, "rb") as f: - result = parser.read_csv(f, compression="zip") - else: - result = parser.read_csv(path, compression=compression) + if compression == "zip2": + with open(path, "rb") as f: + result = parser.read_csv(f, compression="zip") + else: + result = parser.read_csv(path, compression=compression) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("compression", ["zip", "infer"]) -def test_zip_error_multiple_files(parser_and_data, compression): +def test_zip_error_multiple_files(tmp_path, parser_and_data, compression): parser, data, expected = parser_and_data - with tm.ensure_clean("combined_zip.zip") as path: - inner_file_names = ["test_file", "second_file"] + path = tmp_path / "combined_zip.zip" + inner_file_names = ["test_file", "second_file"] - with zipfile.ZipFile(path, mode="w") as tmp: - for file_name in inner_file_names: - tmp.writestr(file_name, data) + with zipfile.ZipFile(path, mode="w") as tmp: + for file_name in inner_file_names: + tmp.writestr(file_name, data) - with pytest.raises(ValueError, match="Multiple files"): - parser.read_csv(path, compression=compression) + with pytest.raises(ValueError, match="Multiple files"): + parser.read_csv(path, compression=compression) -def test_zip_error_no_files(parser_and_data): +def test_zip_error_no_files(tmp_path, parser_and_data): parser, _, _ = parser_and_data - with tm.ensure_clean() as path: - with zipfile.ZipFile(path, mode="w"): - pass + path = tmp_path / "test_file.zip" + with zipfile.ZipFile(path, mode="w"): + pass - with pytest.raises(ValueError, match="Zero files"): - parser.read_csv(path, compression="zip") + with pytest.raises(ValueError, match="Zero files"): + parser.read_csv(path, compression="zip") -def test_zip_error_invalid_zip(parser_and_data): +def test_zip_error_invalid_zip(tmp_path, parser_and_data): parser, _, _ = parser_and_data - with tm.ensure_clean() as path: - with open(path, "rb") as f: - with pytest.raises(zipfile.BadZipFile, match="File is not a zip file"): - parser.read_csv(f, compression="zip") + path = tmp_path / "invalid_file.zip" + path.touch() + with open(path, "rb") as f: + with pytest.raises(zipfile.BadZipFile, match="File is not a zip file"): + parser.read_csv(f, compression="zip") @pytest.mark.parametrize("filename", [None, "test.{ext}"]) def test_compression( + tmp_path, request, parser_and_data, compression_only, @@ -108,17 +110,17 @@ def test_compression( ) ) - with tm.ensure_clean(filename=filename) as path: - tm.write_to_compressed(compress_type, path, data) - compression = "infer" if filename else compress_type + path = tmp_path / filename if filename else tmp_path / "test_file" + tm.write_to_compressed(compress_type, path, data) + compression = "infer" if filename else compress_type - if buffer: - 
with open(path, "rb") as f: - result = parser.read_csv(f, compression=compression) - else: - result = parser.read_csv(path, compression=compression) + if buffer: + with open(path, "rb") as f: + result = parser.read_csv(f, compression=compression) + else: + result = parser.read_csv(path, compression=compression) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("ext", [None, "gz", "bz2"]) @@ -175,21 +177,22 @@ def test_compression_tar_archive(all_parsers, csv_dir_path): assert list(df.columns) == ["a"] -def test_ignore_compression_extension(all_parsers): +def test_ignore_compression_extension(tmp_path, all_parsers): parser = all_parsers df = DataFrame({"a": [0, 1]}) - with tm.ensure_clean("test.csv") as path_csv: - with tm.ensure_clean("test.csv.zip") as path_zip: - # make sure to create un-compressed file with zip extension - df.to_csv(path_csv, index=False) - Path(path_zip).write_text( - Path(path_csv).read_text(encoding="utf-8"), encoding="utf-8" - ) - tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df) + path_csv = tmp_path / "test.csv" + path_zip = tmp_path / "test.csv.zip" + # make sure to create un-compressed file with zip extension + df.to_csv(path_csv, index=False) + Path(path_zip).write_text( + Path(path_csv).read_text(encoding="utf-8"), encoding="utf-8" + ) + + tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df) -def test_writes_tar_gz(all_parsers): +def test_writes_tar_gz(tmp_path, all_parsers): parser = all_parsers data = DataFrame( { @@ -197,15 +200,15 @@ def test_writes_tar_gz(all_parsers): "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."], } ) - with tm.ensure_clean("test.tar.gz") as tar_path: - data.to_csv(tar_path, index=False) + tar_path = tmp_path / "test.tar.gz" + data.to_csv(tar_path, index=False) - # test that read_csv infers .tar.gz to gzip: - tm.assert_frame_equal(parser.read_csv(tar_path), data) + # test that read_csv infers .tar.gz to gzip: + tm.assert_frame_equal(parser.read_csv(tar_path), data) - # test that file is indeed gzipped: - with tarfile.open(tar_path, "r:gz") as tar: - result = parser.read_csv( - tar.extractfile(tar.getnames()[0]), compression="infer" - ) - tm.assert_frame_equal(result, data) + # test that file is indeed gzipped: + with tarfile.open(tar_path, "r:gz") as tar: + result = parser.read_csv( + tar.extractfile(tar.getnames()[0]), compression="infer" + ) + tm.assert_frame_equal(result, data) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 9977e2b8e1a1d..c5b43484f7615 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -200,7 +200,7 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): @xfail_pyarrow # ValueError: Found non-unique column index -def test_no_multi_index_level_names_empty(all_parsers): +def test_no_multi_index_level_names_empty(temp_file, all_parsers): # GH 10984 parser = all_parsers midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)]) @@ -209,9 +209,8 @@ def test_no_multi_index_level_names_empty(all_parsers): index=midx, columns=["x", "y", "z"], ) - with tm.ensure_clean() as path: - expected.to_csv(path) - result = parser.read_csv(path, index_col=[0, 1, 2]) + expected.to_csv(temp_file) + result = parser.read_csv(temp_file, index_col=[0, 1, 2]) tm.assert_frame_equal(result, expected) @@ -240,7 +239,7 @@ def test_header_with_index_col(all_parsers): @pytest.mark.slow -def 
test_index_col_large_csv(all_parsers, monkeypatch): +def test_index_col_large_csv(temp_file, all_parsers, monkeypatch): # https://github.com/pandas-dev/pandas/issues/37094 parser = all_parsers @@ -252,11 +251,10 @@ def test_index_col_large_csv(all_parsers, monkeypatch): } ) - with tm.ensure_clean() as path: - df.to_csv(path, index=False) - with monkeypatch.context() as m: - m.setattr("pandas.core.algorithms._MINIMUM_COMP_ARR_LEN", ARR_LEN) - result = parser.read_csv(path, index_col=[0]) + df.to_csv(temp_file, index=False) + with monkeypatch.context() as m: + m.setattr("pandas.core.algorithms._MINIMUM_COMP_ARR_LEN", ARR_LEN) + result = parser.read_csv(temp_file, index_col=[0]) tm.assert_frame_equal(result, df.set_index("a")) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index 348c19ac0f0c6..a0ccae5a84941 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -127,7 +127,7 @@ def reader(arg): @xfail_pyarrow # ValueError: The 'nrows' option is not supported -def test_multi_thread_path_multipart_read_csv(all_parsers): +def test_multi_thread_path_multipart_read_csv(tmp_path, all_parsers): # see gh-11786 num_tasks = 4 num_rows = 48 @@ -149,10 +149,10 @@ def test_multi_thread_path_multipart_read_csv(all_parsers): } ) - with tm.ensure_clean(file_name) as path: - df.to_csv(path) + path = tmp_path / file_name + df.to_csv(path) - result = _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks) + result = _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks) expected = df[:] expected["date"] = expected["date"].astype("M8[s]") diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 4ced43b6463e1..beb5e8d9d996c 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -86,7 +86,7 @@ def test_date_col_as_index_col(all_parsers): @xfail_pyarrow -def test_nat_parse(all_parsers): +def test_nat_parse(all_parsers, temp_file): # see gh-3062 parser = all_parsers df = DataFrame( @@ -97,11 +97,11 @@ def test_nat_parse(all_parsers): ) df.iloc[3:6, :] = np.nan - with tm.ensure_clean("__nat_parse_.csv") as path: - df.to_csv(path) + path = temp_file + df.to_csv(path) - result = parser.read_csv(path, index_col=0, parse_dates=["B"]) - tm.assert_frame_equal(result, df) + result = parser.read_csv(path, index_col=0, parse_dates=["B"]) + tm.assert_frame_equal(result, df) @skip_pyarrow diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index eed2403a88922..0de65ab889be8 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -158,7 +158,9 @@ def test_skipfooter(python_parser_only, kwargs): @pytest.mark.parametrize( "compression,klass", [("gzip", "GzipFile"), ("bz2", "BZ2File")] ) -def test_decompression_regex_sep(python_parser_only, csv1, compression, klass): +def test_decompression_regex_sep( + temp_file, python_parser_only, csv1, compression, klass +): # see gh-6607 parser = python_parser_only @@ -171,12 +173,11 @@ def test_decompression_regex_sep(python_parser_only, csv1, compression, klass): module = pytest.importorskip(compression) klass = getattr(module, klass) - with tm.ensure_clean() as path: - with klass(path, mode="wb") as tmp: - tmp.write(data) + with klass(temp_file, mode="wb") as tmp: + tmp.write(data) - result = parser.read_csv(path, sep="::", 
compression=compression) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(temp_file, sep="::", compression=compression) + tm.assert_frame_equal(result, expected) def test_read_csv_buglet_4x_multi_index(python_parser_only): diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 6243185294894..4b78048a3a073 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -8,7 +8,6 @@ BytesIO, StringIO, ) -from pathlib import Path import numpy as np import pytest @@ -642,7 +641,7 @@ def test_default_delimiter(): @pytest.mark.parametrize("infer", [True, False]) -def test_fwf_compression(compression_only, infer, compression_to_extension): +def test_fwf_compression(compression_only, infer, compression_to_extension, temp_file): data = """1111111111 2222222222 3333333333""".strip() @@ -655,17 +654,17 @@ def test_fwf_compression(compression_only, infer, compression_to_extension): data = bytes(data, encoding="utf-8") - with tm.ensure_clean(filename="tmp." + extension) as path: - tm.write_to_compressed(compression, path, data) + path = temp_file.parent / f"tmp.{extension}" + tm.write_to_compressed(compression, path, data) - if infer is not None: - kwargs["compression"] = "infer" if infer else compression + if infer is not None: + kwargs["compression"] = "infer" if infer else compression - result = read_fwf(path, **kwargs) - tm.assert_frame_equal(result, expected) + result = read_fwf(path, **kwargs) + tm.assert_frame_equal(result, expected) -def test_binary_mode(): +def test_binary_mode(temp_file): """ read_fwf supports opening files in binary mode. @@ -676,31 +675,31 @@ def test_binary_mode(): df_reference = DataFrame( [["bba", "bab", "b a"]], columns=["aaa", "aaa.1", "aaa.2"], index=[0] ) - with tm.ensure_clean() as path: - Path(path).write_text(data, encoding="utf-8") - with open(path, "rb") as file: - df = read_fwf(file) - file.seek(0) - tm.assert_frame_equal(df, df_reference) + path = temp_file + path.write_text(data, encoding="utf-8") + with open(path, "rb") as file: + df = read_fwf(file) + file.seek(0) + tm.assert_frame_equal(df, df_reference) @pytest.mark.parametrize("memory_map", [True, False]) -def test_encoding_mmap(memory_map): +def test_encoding_mmap(memory_map, temp_file): """ encoding should be working, even when using a memory-mapped file. GH 23254. 
""" encoding = "iso8859_1" - with tm.ensure_clean() as path: - Path(path).write_bytes(" 1 A Ä 2\n".encode(encoding)) - df = read_fwf( - path, - header=None, - widths=[2, 2, 2, 2], - encoding=encoding, - memory_map=memory_map, - ) + path = temp_file + path.write_bytes(" 1 A Ä 2\n".encode(encoding)) + df = read_fwf( + path, + header=None, + widths=[2, 2, 2, 2], + encoding=encoding, + memory_map=memory_map, + ) df_reference = DataFrame([[1, "A", "Ä", 2]]) tm.assert_frame_equal(df, df_reference) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 07f84466e3ac2..8812a65ee4e7d 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -167,7 +167,7 @@ def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers): parser.read_csv(sio, on_bad_lines=bad_lines_func) -def test_close_file_handle_on_invalid_usecols(all_parsers): +def test_close_file_handle_on_invalid_usecols(all_parsers, temp_file): # GH 45384 parser = all_parsers @@ -176,13 +176,13 @@ def test_close_file_handle_on_invalid_usecols(all_parsers): # Raises pyarrow.lib.ArrowKeyError pytest.skip(reason="https://github.com/apache/arrow/issues/38676") - with tm.ensure_clean("test.csv") as fname: - Path(fname).write_text("col1,col2\na,b\n1,2", encoding="utf-8") - with tm.assert_produces_warning(False): - with pytest.raises(error, match="col3"): - parser.read_csv(fname, usecols=["col1", "col2", "col3"]) - # unlink fails on windows if file handles still point to it - os.unlink(fname) + fname = temp_file + Path(fname).write_text("col1,col2\na,b\n1,2", encoding="utf-8") + with tm.assert_produces_warning(False): + with pytest.raises(error, match="col3"): + parser.read_csv(fname, usecols=["col1", "col2", "col3"]) + # unlink fails on windows if file handles still point to it + os.unlink(fname) def test_invalid_file_inputs(request, all_parsers): diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py index 23932b6092998..37e6eeb05deec 100644 --- a/pandas/tests/io/pytables/test_errors.py +++ b/pandas/tests/io/pytables/test_errors.py @@ -11,7 +11,6 @@ HDFStore, Index, MultiIndex, - _testing as tm, date_range, read_hdf, ) @@ -182,16 +181,16 @@ def test_append_with_diff_col_name_types_raises_value_error(setup_path): store.append(name, d) -def test_invalid_complib(setup_path): +def test_invalid_complib(tmp_path, setup_path): df = DataFrame( np.random.default_rng(2).random((4, 5)), index=list("abcd"), columns=list("ABCDE"), ) - with tm.ensure_clean(setup_path) as path: - msg = r"complib only supports \[.*\] compression." - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, key="df", complib="foolib") + path = tmp_path / setup_path + msg = r"complib only supports \[.*\] compression." 
+ with pytest.raises(ValueError, match=msg): + df.to_hdf(path, key="df", complib="foolib") @pytest.mark.parametrize( diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index 295c3b72ba5e2..2ea2ac632c992 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -161,30 +161,28 @@ def test_reopen_handle(tmp_path, setup_path): assert not store.is_open -def test_open_args(setup_path, using_infer_string): - with tm.ensure_clean(setup_path) as path: - df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), - ) +def test_open_args(tmp_path, setup_path, using_infer_string): + path = tmp_path / setup_path + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) - # create an in memory store - store = HDFStore( - path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0 - ) - store["df"] = df - store.append("df2", df) + # create an in memory store + store = HDFStore(path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0) + store["df"] = df + store.append("df2", df) - expected = df.copy() - if using_infer_string: - expected.index = expected.index.astype("str") - expected.columns = expected.columns.astype("str") + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") - tm.assert_frame_equal(store["df"], expected) - tm.assert_frame_equal(store["df2"], expected) + tm.assert_frame_equal(store["df"], expected) + tm.assert_frame_equal(store["df2"], expected) - store.close() + store.close() # the file should not have actually been written assert not os.path.exists(path) @@ -507,7 +505,7 @@ def test_multiple_open_close(tmp_path, setup_path): store.df -def test_fspath(): - with tm.ensure_clean("foo.h5") as path: - with HDFStore(path) as store: - assert os.fspath(store) == str(path) +def test_fspath(tmp_path): + path = tmp_path / "foo.h5" + with HDFStore(path) as store: + assert os.fspath(store) == str(path) diff --git a/pandas/tests/io/pytables/test_pytables_missing.py b/pandas/tests/io/pytables/test_pytables_missing.py index 8d9d3afc4ad6f..b46028f936e94 100644 --- a/pandas/tests/io/pytables/test_pytables_missing.py +++ b/pandas/tests/io/pytables/test_pytables_missing.py @@ -3,12 +3,11 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas._testing as tm @td.skip_if_installed("tables") -def test_pytables_raises(): +def test_pytables_raises(tmp_path): df = pd.DataFrame({"A": [1, 2]}) + path = tmp_path / "foo.h5" with pytest.raises(ImportError, match="tables"): - with tm.ensure_clean("foo.h5") as path: - df.to_hdf(path, key="df") + df.to_hdf(path, key="df") diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 409b92d2ddde1..37e3e9d4f9db2 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -27,33 +27,31 @@ pytestmark = [pytest.mark.single_cpu] -def test_conv_read_write(): - with tm.ensure_clean() as path: +def test_conv_read_write(temp_file): + def roundtrip(key, obj, **kwargs): + obj.to_hdf(temp_file, key=key, **kwargs) + return read_hdf(temp_file, key) - def roundtrip(key, obj, **kwargs): - obj.to_hdf(path, key=key, **kwargs) - return 
read_hdf(path, key) - - o = Series( - np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) - ) - tm.assert_series_equal(o, roundtrip("series", o)) + o = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) + tm.assert_series_equal(o, roundtrip("series", o)) - o = Series(range(10), dtype="float64", index=[f"i_{i}" for i in range(10)]) - tm.assert_series_equal(o, roundtrip("string_series", o)) + o = Series(range(10), dtype="float64", index=[f"i_{i}" for i in range(10)]) + tm.assert_series_equal(o, roundtrip("string_series", o)) - o = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD")), - index=Index([f"i-{i}" for i in range(30)]), - ) - tm.assert_frame_equal(o, roundtrip("frame", o)) + o = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), + ) + tm.assert_frame_equal(o, roundtrip("frame", o)) - # table - df = DataFrame({"A": range(5), "B": range(5)}) - df.to_hdf(path, key="table", append=True) - result = read_hdf(path, "table", where=["index>2"]) - tm.assert_frame_equal(df[df.index > 2], result) + # table + df = DataFrame({"A": range(5), "B": range(5)}) + df.to_hdf(temp_file, key="table", append=True) + result = read_hdf(temp_file, "table", where=["index>2"]) + tm.assert_frame_equal(df[df.index > 2], result) def test_long_strings(setup_path): diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 2c193c968e2b5..1b473993496a5 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -51,7 +51,7 @@ def test_orc_reader_empty(dirpath, using_infer_string): "str" if using_infer_string else "object", ] expected = pd.DataFrame(index=pd.RangeIndex(0)) - for colname, dtype in zip(columns, dtypes): + for colname, dtype in zip(columns, dtypes, strict=True): expected[colname] = pd.Series(dtype=dtype) expected.columns = expected.columns.astype("str") diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1b9ae5d8e7209..5865c46b4031e 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -748,10 +748,9 @@ def postgresql_psycopg2_conn_types(postgresql_psycopg2_engine_types): @pytest.fixture -def sqlite_str(): +def sqlite_str(temp_file): pytest.importorskip("sqlalchemy") - with tm.ensure_clean() as name: - yield f"sqlite:///{name}" + return f"sqlite:///{temp_file}" @pytest.fixture @@ -817,20 +816,19 @@ def sqlite_conn_types(sqlite_engine_types): @pytest.fixture -def sqlite_adbc_conn(): +def sqlite_adbc_conn(temp_file): pytest.importorskip("pyarrow") pytest.importorskip("adbc_driver_sqlite") from adbc_driver_sqlite import dbapi - with tm.ensure_clean() as name: - uri = f"file:{name}" - with dbapi.connect(uri) as conn: - yield conn - for view in get_all_views(conn): - drop_view(view, conn) - for tbl in get_all_tables(conn): - drop_table(tbl, conn) - conn.commit() + uri = f"file:{temp_file}" + with dbapi.connect(uri) as conn: + yield conn + for view in get_all_views(conn): + drop_view(view, conn) + for tbl in get_all_tables(conn): + drop_table(tbl, conn) + conn.commit() @pytest.fixture @@ -2504,20 +2502,20 @@ def test_sqlalchemy_integer_overload_mapping(conn, request, integer): sql.SQLTable("test_type", db, frame=df) -def test_database_uri_string(request, test_frame1): +def test_database_uri_string(temp_file, request, test_frame1): pytest.importorskip("sqlalchemy") # Test read_sql and .to_sql method with a database URI (GH10654) # db_uri = 'sqlite:///:memory:' 
# raises # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near # "iris": syntax error [SQL: 'iris'] - with tm.ensure_clean() as name: - db_uri = "sqlite:///" + name - table = "iris" - test_frame1.to_sql(name=table, con=db_uri, if_exists="replace", index=False) - test_frame2 = sql.read_sql(table, db_uri) - test_frame3 = sql.read_sql_table(table, db_uri) - query = "SELECT * FROM iris" - test_frame4 = sql.read_sql_query(query, db_uri) + name = str(temp_file) + db_uri = "sqlite:///" + name + table = "iris" + test_frame1.to_sql(name=table, con=db_uri, if_exists="replace", index=False) + test_frame2 = sql.read_sql(table, db_uri) + test_frame3 = sql.read_sql_table(table, db_uri) + query = "SELECT * FROM iris" + test_frame4 = sql.read_sql_query(query, db_uri) tm.assert_frame_equal(test_frame1, test_frame2) tm.assert_frame_equal(test_frame1, test_frame3) tm.assert_frame_equal(test_frame1, test_frame4) @@ -2581,16 +2579,15 @@ def test_column_with_percentage(conn, request): tm.assert_frame_equal(res, df) -def test_sql_open_close(test_frame3): +def test_sql_open_close(temp_file, test_frame3): # Test if the IO in the database still work if the connection closed # between the writing and reading (as in many real situations). - with tm.ensure_clean() as name: - with contextlib.closing(sqlite3.connect(name)) as conn: - assert sql.to_sql(test_frame3, "test_frame3_legacy", conn, index=False) == 4 + with contextlib.closing(sqlite3.connect(temp_file)) as conn: + assert sql.to_sql(test_frame3, "test_frame3_legacy", conn, index=False) == 4 - with contextlib.closing(sqlite3.connect(name)) as conn: - result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn) + with contextlib.closing(sqlite3.connect(temp_file)) as conn: + result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn) tm.assert_frame_equal(test_frame3, result) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d3bef4c863b28..c38ee32cb7226 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -3097,3 +3097,27 @@ def test_merge_categorical_key_recursion(): right.astype("float64"), on="key", how="outer" ) tm.assert_frame_equal(result, expected) + + +def test_merge_pyarrow_datetime_duplicates(): + # GH#61926 + pytest.importorskip("pyarrow") + + t = pd.date_range("2025-07-06", periods=3, freq="h") + df1 = DataFrame({"time": t, "val1": [1, 2, 3]}) + df1 = df1.convert_dtypes(dtype_backend="pyarrow") + + df2 = DataFrame({"time": t.repeat(2), "val2": [10, 20, 30, 40, 50, 60]}) + df2 = df2.convert_dtypes(dtype_backend="pyarrow") + + result = merge(df1, df2, on="time", how="left") + + expected = DataFrame( + { + "time": t.repeat(2), + "val1": [1, 1, 2, 2, 3, 3], + "val2": [10, 20, 30, 40, 50, 60], + } + ) + expected = expected.convert_dtypes(dtype_backend="pyarrow") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index ef034e62bb764..57d0c60118504 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -209,11 +209,14 @@ def test_convert_dtypes( "convert_boolean", "convert_floating", ] - params_dict = dict(zip(param_names, params)) + params_dict = dict(zip(param_names, params, strict=True)) expected_dtype = expected_default for spec, dtype in expected_other.items(): - if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])): + if 
all( + params_dict[key] is val + for key, val in zip(spec[::2], spec[1::2], strict=False) + ): expected_dtype = dtype if ( using_infer_string diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index f5a97d61990a4..c912b9abd22ca 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -21,7 +21,7 @@ def test_rename(self, datetime_series): assert renamed.index[0] == renamer(ts.index[0]) # dict - rename_dict = dict(zip(ts.index, renamed.index)) + rename_dict = dict(zip(ts.index, renamed.index, strict=True)) renamed2 = ts.rename(rename_dict) tm.assert_series_equal(renamed, renamed2) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 7d068c2120735..f7383dd967cbe 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -648,7 +648,7 @@ def test_replace_different_int_types(self, any_int_numpy_dtype): labs = pd.Series([1, 1, 1, 0, 0, 2, 2, 2], dtype=any_int_numpy_dtype) maps = pd.Series([0, 2, 1], dtype=any_int_numpy_dtype) - map_dict = dict(zip(maps.values, maps.index)) + map_dict = dict(zip(maps.values, maps.index, strict=True)) result = labs.replace(map_dict) expected = labs.replace({0: 0, 2: 1, 1: 2}) diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index d42aafc001680..3ff39b41a352d 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -146,7 +146,7 @@ def test_reset_index_with_drop(self): ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], ["one", "two", "one", "two", "one", "two", "one", "two"], ] - tuples = zip(*arrays) + tuples = zip(*arrays, strict=True) index = MultiIndex.from_tuples(tuples) data = np.random.default_rng(2).standard_normal(8) ser = Series(data, index=index) diff --git a/pandas/tests/series/methods/test_sort_index.py b/pandas/tests/series/methods/test_sort_index.py index d6817aa179b7b..861017f448195 100644 --- a/pandas/tests/series/methods/test_sort_index.py +++ b/pandas/tests/series/methods/test_sort_index.py @@ -186,7 +186,7 @@ def test_sort_index_ascending_list(self): ["one", "two", "one", "two", "one", "two", "one", "two"], [4, 3, 2, 1, 4, 3, 2, 1], ] - tuples = zip(*arrays) + tuples = zip(*arrays, strict=True) mi = MultiIndex.from_tuples(tuples, names=["first", "second", "third"]) ser = Series(range(8), index=mi) diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index c5414022e664b..11fc3034cf290 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -110,3 +110,19 @@ def test_string_array_extract(nullable_string_dtype): result = result.astype(object) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "values, width, expected", + [ + (["a", "ab", "abc", None], 4, ["000a", "00ab", "0abc", None]), + (["1", "-1", "+1", None], 4, ["0001", "-001", "+001", None]), + (["1234", "-1234"], 3, ["1234", "-1234"]), + ], +) +def test_string_array_zfill(nullable_string_dtype, values, width, expected): + # GH #61485 + s = Series(values, dtype=nullable_string_dtype) + result = s.str.zfill(width) + expected = Series(expected, dtype=nullable_string_dtype) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index eddfeb80967ef..f59339cacd8d8 100644 --- 
a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3790,3 +3790,14 @@ def test_to_datetime_wrapped_datetime64_ps(): ["1970-01-01 00:00:01.901901901"], dtype="datetime64[ns]", freq=None ) tm.assert_index_equal(result, expected) + + +def test_to_datetime_lxml_elementunicoderesult_with_format(cache): + etree = pytest.importorskip("lxml.etree") + + s = "2025-02-05 16:59:57" + node = etree.XML(f"<date>{s}</date>") + val = node.xpath("/date/node()")[0] # _ElementUnicodeResult + + out = to_datetime(Series([val]), format="%Y-%m-%d %H:%M:%S", cache=cache) + assert out.iloc[0] == Timestamp(s) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 26b182fb4e9b1..28badd877fccb 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -1157,6 +1157,11 @@ def test_offset_multiplication( tm.assert_series_equal(resultarray, expectedarray) +def test_offset_deprecated_error(): + with pytest.raises(ValueError, match="Did you mean h"): + date_range("2012-01-01", periods=3, freq="H") + + def test_dateoffset_operations_on_dataframes(performance_warning): # GH 47953 df = DataFrame({"T": [Timestamp("2019-04-30")], "D": [DateOffset(months=1)]}) diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 6c134a11cb530..fc5ffa24980f5 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -25,7 +25,10 @@ # Usually we wouldn't want this import in this test file (which is targeted at # tslibs.parsing), but it is convenient to test the Timestamp constructor at # the same time as the other parsing functions. -from pandas import Timestamp +from pandas import ( + Timestamp, + option_context, +) import pandas._testing as tm from pandas._testing._hypothesis import DATETIME_NO_TZ @@ -422,3 +425,40 @@ def test_hypothesis_delimited_date( assert except_out_dateutil == except_in_dateutil assert result == expected + + +@pytest.mark.parametrize("input", ["21-01-01", "01-01-21"]) +@pytest.mark.parametrize("dayfirst", [True, False]) +def test_parse_datetime_string_with_reso_dayfirst(dayfirst, input): + with option_context("display.date_dayfirst", dayfirst): + except_out_dateutil, result = _helper_hypothesis_delimited_date( + parsing.parse_datetime_string_with_reso, input + ) + + except_in_dateutil, expected = _helper_hypothesis_delimited_date( + du_parse, + input, + default=datetime(1, 1, 1), + dayfirst=dayfirst, + yearfirst=False, + ) + assert except_out_dateutil == except_in_dateutil + assert result[0] == expected + + +@pytest.mark.parametrize("input", ["21-01-01", "01-01-21"]) +@pytest.mark.parametrize("yearfirst", [True, False]) +def test_parse_datetime_string_with_reso_yearfirst(yearfirst, input): + with option_context("display.date_yearfirst", yearfirst): + except_out_dateutil, result = _helper_hypothesis_delimited_date( + parsing.parse_datetime_string_with_reso, input + ) + except_in_dateutil, expected = _helper_hypothesis_delimited_date( + du_parse, + input, + default=datetime(1, 1, 1), + dayfirst=False, + yearfirst=yearfirst, + ) + assert except_out_dateutil == except_in_dateutil + assert result[0] == expected diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 8711365a19214..a040662432bc4 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -413,3 +413,12 @@ def test_datetimelike_compat_deprecated():
tm.assert_series_equal(df["a"], df["a"], check_datetimelike_compat=True) with tm.assert_produces_warning(Pandas4Warning, match=msg): tm.assert_series_equal(df["a"], df["a"], check_datetimelike_compat=False) + + +def test_assert_frame_equal_nested_df_na(): + # GH#43022 + inner = DataFrame({"a": [1, pd.NA]}) + df1 = DataFrame({"df": [inner]}) + df2 = DataFrame({"df": [inner]}) + + tm.assert_frame_equal(df1, df2) diff --git a/pyproject.toml b/pyproject.toml index f74f509f61692..1e5c51866d0cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -391,6 +391,10 @@ ignore = [ "PLW0603", # runtime-cast-value "TC006", + # unused-unpacked-variable + "RUF059", + # pytest-raises-ambiguous-pattern + "RUF043", ] exclude = [ @@ -441,19 +445,9 @@ exclude = [ "pandas/_config/config.py" = ["B905"] "pandas/conftest.py" = ["B905"] "pandas/core/array_algos/quantile.py" = ["B905"] -"pandas/core/arrays/arrow/array.py" = ["B905"] -"pandas/core/arrays/base.py" = ["B905"] -"pandas/core/arrays/categorical.py" = ["B905"] -"pandas/core/arrays/datetimelike.py" = ["B905"] -"pandas/core/arrays/interval.py" = ["B905"] -"pandas/core/arrays/masked.py" = ["B905"] -"pandas/core/arrays/period.py" = ["B905"] -"pandas/core/arrays/sparse/array.py" = ["B905"] -"pandas/core/arrays/timedeltas.py" = ["B905"] "pandas/core/computation/align.py" = ["B905"] "pandas/core/computation/expr.py" = ["B905"] "pandas/core/computation/ops.py" = ["B905"] -"pandas/core/dtypes/missing.py" = ["B905"] "pandas/core/groupby/generic.py" = ["B905"] "pandas/core/groupby/groupby.py" = ["B905"] "pandas/core/groupby/grouper.py" = ["B905"] @@ -467,7 +461,6 @@ exclude = [ "pandas/core/reshape/merge.py" = ["B905"] "pandas/core/reshape/pivot.py" = ["B905"] "pandas/core/reshape/reshape.py" = ["B905"] -"pandas/core/strings/accessor.py" = ["B905"] "pandas/core/window/rolling.py" = ["B905"] "pandas/_testing/asserters.py" = ["B905"] "pandas/_testing/_warnings.py" = ["B905"]
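Illustrative note (not part of the patch): the test changes above replace pandas' tm.ensure_clean context manager with pytest's built-in tmp_path fixture (or the repository's temp_file fixture), so temporary files live in a per-test directory that pytest cleans up on its own. A minimal sketch of that pattern, assuming only pandas and pytest; the test name and CSV contents here are hypothetical, not taken from the diff:

import pandas as pd
import pandas._testing as tm


def test_csv_roundtrip(tmp_path):
    # tmp_path is a pathlib.Path unique to this test invocation
    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    path = tmp_path / "roundtrip.csv"
    df.to_csv(path, index=False)
    # no explicit cleanup needed; pytest prunes old tmp_path directories itself
    result = pd.read_csv(path)
    tm.assert_frame_equal(result, df)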