Merge remote-tracking branch 'upstream/main' into cow_array_eas

# Conflicts: # pandas/tests/copy_view/test_array.py
phofl · Mar 29, 2023 · 42368e0 · 42368e0
2 parents 2d8a2f2 + 2736ec8
commit 42368e0
Show file tree

Hide file tree

Showing 317 changed files with 5,815 additions and 3,967 deletions.
diff --git a/.circleci/setup_env.sh b/.circleci/setup_env.sh
@@ -55,8 +55,7 @@ if pip list | grep -q ^pandas; then
 fi
 
 echo "Build extensions"
-# GH 47305: Parallel build can causes flaky ImportError from pandas/_libs/tslibs
-python setup.py build_ext -q -j1
+python setup.py build_ext -q -j4
 
 echo "Install pandas"
 python -m pip install --no-build-isolation --no-use-pep517 -e .

diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml
@@ -16,7 +16,5 @@ runs:
         python -m pip install -e . --no-build-isolation --no-use-pep517 --no-index
       shell: bash -el {0}
       env:
-        # Cannot use parallel compilation on Windows, see https://github.com/pandas-dev/pandas/issues/30873
-        # GH 47305: Parallel build causes flaky ImportError: /home/runner/work/pandas/pandas/pandas/_libs/tslibs/timestamps.cpython-38-x86_64-linux-gnu.so: undefined symbol: pandas_datetime_to_datetimestruct
-        N_JOBS: 1
-        #N_JOBS: ${{ runner.os == 'Windows' && 1 || 2 }}
+        # https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners#supported-runners-and-hardware-resources
+        N_JOBS: ${{ runner.os == 'macOS' && 3 || 2 }}
diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml
@@ -30,7 +30,7 @@ runs:
         environment-name: ${{ inputs.environment-name }}
         extra-specs: ${{ inputs.extra-specs }}
         channels: conda-forge
-        channel-priority: ${{ runner.os == 'macOS' && 'flexible' || 'strict' }}
+        channel-priority: 'strict'
         condarc-file: ci/condarc.yml
         cache-env: true
         cache-downloads: true
diff --git a/.github/workflows/32-bit-linux.yml b/.github/workflows/32-bit-linux.yml
@@ -40,7 +40,7 @@ jobs:
           python -m pip install --no-deps -U pip wheel 'setuptools<60.0.0' && \
           python -m pip install versioneer[toml] && \
           python -m pip install cython numpy python-dateutil pytz pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.34.2 && \
-          python setup.py build_ext -q -j1 && \
+          python setup.py build_ext -q -j$(nproc) && \
           python -m pip install --no-build-isolation --no-use-pep517 -e . && \
           python -m pip list && \
           export PANDAS_CI=1 && \

diff --git a/.github/workflows/dependabot.yml b/.github/workflows/dependabot.yml
@@ -0,0 +1,9 @@
+version: 2
+updates:
+  - package-ecosystem: github-actions
+    directory: /
+    schedule:
+      interval: weekly
+    labels:
+      - "CI"
+      - "Dependencies"
diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml
@@ -20,7 +20,7 @@ jobs:
     runs-on: ubuntu-22.04
     strategy:
       matrix:
-        extra: ["test", "performance", "timezone", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output_formatting", "clipboard", "compression", "all"]
+        extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output_formatting", "clipboard", "compression", "all"]
       fail-fast: false
     name: Install Extras - ${{ matrix.extra }}
     concurrency:

diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml
@@ -82,10 +82,9 @@ jobs:
         python -m pip install python-dateutil pytz cython hypothesis>=6.34.2 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17
         python -m pip list
 
-    # GH 47305: Parallel build can cause flaky ImportError from pandas/_libs/tslibs
     - name: Build Pandas
       run: |
-        python setup.py build_ext -q -j1
+        python setup.py build_ext -q -j4
         python -m pip install -e . --no-build-isolation --no-use-pep517 --no-index
 
     - name: Build Version

diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
@@ -28,7 +28,6 @@ jobs:
         env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml]
         pattern: ["not single_cpu", "single_cpu"]
         pyarrow_version: ["8", "9", "10"]
-        pandas_ci: [1]
         include:
           - name: "Downstream Compat"
             env_file: actions-38-downstream_compat.yaml
@@ -75,7 +74,7 @@ jobs:
             test_args: "-W error::DeprecationWarning -W error::FutureWarning"
             # TODO(cython3): Re-enable once next-beta(after beta 1) comes out
             # There are some warnings failing the build with -werror
-            pandas_ci: 0
+            pandas_ci: "0"
         exclude:
           - env_file: actions-38.yaml
             pyarrow_version: "8"
@@ -99,7 +98,7 @@ jobs:
       LC_ALL: ${{ matrix.lc_all || '' }}
       PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }}
       PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }}
-      PANDAS_CI: ${{ matrix.pandas_ci }}
+      PANDAS_CI: ${{ matrix.pandas_ci || '1' }}
       TEST_ARGS: ${{ matrix.test_args || '' }}
       PYTEST_WORKERS: ${{ contains(matrix.pattern, 'not single_cpu') && 'auto' || '1' }}
       PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}

diff --git a/.gitignore b/.gitignore
@@ -53,6 +53,9 @@ dist
 # type checkers
 pandas/py.typed
 
+# pyenv
+.python-version
+
 # tox testing tool
 .tox
 # rope

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -28,7 +28,7 @@ repos:
         types_or: [python, pyi]
         additional_dependencies: [black==23.1.0]
 -   repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.0.253
+    rev: v0.0.259
     hooks:
     -   id: ruff
         args: [--exit-non-zero-on-fix]
@@ -392,14 +392,6 @@ repos:
         files: ^pandas/
         exclude: ^(pandas/_libs/|pandas/tests/|pandas/errors/__init__.py$|pandas/_version.py)
         types: [python]
-    -   id: flake8-pyi
-        name: flake8-pyi
-        entry: flake8 --extend-ignore=E301,E302,E305,E701,E704
-        types: [pyi]
-        language: python
-        additional_dependencies:
-        - flake8==5.0.4
-        - flake8-pyi==22.8.1
     -   id: future-annotations
         name: import annotations from __future__
         entry: 'from __future__ import annotations'
@@ -421,8 +413,8 @@ repos:
         language: python
         stages: [manual]
         additional_dependencies:
-        - autotyping==22.9.0
-        - libcst==0.4.7
+        - autotyping==23.3.0
+        - libcst==0.4.9
     -   id: check-test-naming
         name: check that test names start with 'test'
         entry: python -m scripts.check_test_naming

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -58,3 +58,5 @@ prune pandas/tests/io/parser/data
 # Selectively re-add *.cxx files that were excluded above
 graft pandas/_libs/src
 graft pandas/_libs/tslibs/src
+include pandas/_libs/pd_parser.h
+include pandas/_libs/pd_parser.c
diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py
@@ -266,10 +266,14 @@ def setup(self, tz):
         self.ts = self.s[halfway]
 
         self.s2 = Series(date_range("20010101", periods=N, freq="s", tz=tz))
+        self.ts_different_reso = Timestamp("2001-01-02", tz=tz)
 
     def time_series_timestamp_compare(self, tz):
         self.s <= self.ts
 
+    def time_series_timestamp_different_reso_compare(self, tz):
+        self.s <= self.ts_different_reso
+
     def time_timestamp_series_compare(self, tz):
         self.ts >= self.s
 

diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
@@ -34,7 +34,6 @@ def setup(self, dtype):
 
         # GH37371. Testing construction of string series/frames from ExtensionArrays
         self.series_cat_arr = Categorical(self.series_arr)
-        self.frame_cat_arr = Categorical(self.frame_arr)
 
     def time_series_construction(self, dtype):
         Series(self.series_arr, dtype=dtype)
@@ -54,12 +53,6 @@ def time_cat_series_construction(self, dtype):
     def peakmem_cat_series_construction(self, dtype):
         Series(self.series_cat_arr, dtype=dtype)
 
-    def time_cat_frame_construction(self, dtype):
-        DataFrame(self.frame_cat_arr, dtype=dtype)
-
-    def peakmem_cat_frame_construction(self, dtype):
-        DataFrame(self.frame_cat_arr, dtype=dtype)
-
 
 class Methods(Dtypes):
     def time_center(self, dtype):

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -86,8 +86,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     MSG='Partially validate docstrings (EX01)' ;  echo $MSG
     $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01 --ignore_functions \
         pandas.Series.index \
-        pandas.Series.hasnans \
-        pandas.Series.to_list \
         pandas.Series.__iter__ \
         pandas.Series.keys \
         pandas.Series.item \
@@ -97,6 +95,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.Series.is_monotonic_increasing \
         pandas.Series.is_monotonic_decreasing \
         pandas.Series.backfill \
+        pandas.Series.bfill \
+        pandas.Series.ffill \
         pandas.Series.pad \
         pandas.Series.argsort \
         pandas.Series.reorder_levels \
@@ -307,7 +307,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas_object \
         pandas.api.interchange.from_dataframe \
         pandas.Index.values \
-        pandas.Index.hasnans \
         pandas.Index.dtype \
         pandas.Index.inferred_type \
         pandas.Index.shape \
@@ -541,14 +540,14 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.DataFrame.iterrows \
         pandas.DataFrame.pipe \
         pandas.DataFrame.backfill \
+        pandas.DataFrame.bfill \
+        pandas.DataFrame.ffill \
         pandas.DataFrame.pad \
         pandas.DataFrame.swapaxes \
         pandas.DataFrame.first_valid_index \
         pandas.DataFrame.last_valid_index \
         pandas.DataFrame.attrs \
         pandas.DataFrame.plot \
-        pandas.DataFrame.sparse.density \
-        pandas.DataFrame.sparse.to_coo \
         pandas.DataFrame.to_gbq \
         pandas.DataFrame.style \
         pandas.DataFrame.__dataframe__

diff --git a/ci/deps/actions-310-numpydev.yaml b/ci/deps/actions-310-numpydev.yaml
@@ -18,9 +18,11 @@ dependencies:
   - python-dateutil
   - pytz
   - pip
+
   - pip:
     - "cython"
     - "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple"
     - "--pre"
     - "numpy"
     - "scipy"
+    - "tzdata>=2022.1"
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
@@ -49,8 +49,10 @@ dependencies:
   - scipy>=1.7.1
   - sqlalchemy>=1.4.16
   - tabulate>=0.8.9
-  - tzdata>=2022a
   - xarray>=0.21.0
   - xlrd>=2.0.1
   - xlsxwriter>=1.4.3
   - zstandard>=0.15.2
+
+  - pip:
+    - tzdata>=2022.1
diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
@@ -49,8 +49,10 @@ dependencies:
   - scipy>=1.7.1
   - sqlalchemy>=1.4.16
   - tabulate>=0.8.9
-  - tzdata>=2022a
   - xarray>=0.21.0
   - xlrd>=2.0.1
   - xlsxwriter>=1.4.3
   - zstandard>=0.15.2
+
+  - pip:
+    - tzdata>=2022.1
diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml
@@ -68,3 +68,6 @@ dependencies:
   - pandas-gbq>=0.15.0
   - pyyaml
   - py
+
+  - pip:
+    - tzdata>=2022.1
diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml
@@ -52,11 +52,11 @@ dependencies:
   - scipy=1.7.1
   - sqlalchemy=1.4.16
   - tabulate=0.8.9
-  - tzdata=2022a
   - xarray=0.21.0
   - xlrd=2.0.1
   - xlsxwriter=1.4.3
   - zstandard=0.15.2
 
   - pip:
     - pyqt5==5.15.1
+    - tzdata==2022.1
diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml
@@ -53,3 +53,6 @@ dependencies:
   - xlrd>=2.0.1
   - xlsxwriter>=1.4.3
   - zstandard>=0.15.2
+
+  - pip:
+    - tzdata>=2022.1
diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
@@ -49,8 +49,10 @@ dependencies:
   - scipy>=1.7.1
   - sqlalchemy>=1.4.16
   - tabulate>=0.8.9
-  - tzdata>=2022a
   - xarray>=0.21.0
   - xlrd>=2.0.1
   - xlsxwriter>=1.4.3
   - zstandard>=0.15.2
+
+  - pip:
+    - tzdata>=2022.1
diff --git a/ci/deps/actions-pypy-38.yaml b/ci/deps/actions-pypy-38.yaml
@@ -22,3 +22,6 @@ dependencies:
   - numpy
   - python-dateutil
   - pytz
+
+  - pip:
+    - tzdata>=2022.1
diff --git a/ci/test_wheels_windows.bat b/ci/test_wheels_windows.bat
@@ -3,7 +3,7 @@ pd.test(extra_args=['-m not clipboard and not single_cpu and not slow and not ne
 pd.test(extra_args=['-m not clipboard and single_cpu and not slow and not network and not db'])
 
 python --version
-pip install pytz six numpy python-dateutil
+pip install pytz six numpy python-dateutil tzdata>=2022.1
 pip install hypothesis>=6.34.2 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17
 pip install --find-links=pandas/dist --no-index pandas
 python -c "%test_command%"
diff --git a/doc/source/_static/reshaping_pivot.png b/doc/source/_static/reshaping_pivot.png
diff --git a/doc/source/conf.py b/doc/source/conf.py
@@ -101,20 +101,20 @@
         reldir = os.path.relpath(dirname, source_path)
         for fname in fnames:
             if os.path.splitext(fname)[-1] in (".rst", ".ipynb"):
-                fname = os.path.relpath(os.path.join(dirname, fname), source_path)
+                rel_fname = os.path.relpath(os.path.join(dirname, fname), source_path)
 
-                if fname == "index.rst" and os.path.abspath(dirname) == source_path:
+                if rel_fname == "index.rst" and os.path.abspath(dirname) == source_path:
                     continue
                 if pattern == "-api" and reldir.startswith("reference"):
-                    exclude_patterns.append(fname)
+                    exclude_patterns.append(rel_fname)
                 elif (
                     pattern == "whatsnew"
                     and not reldir.startswith("reference")
                     and reldir != "whatsnew"
                 ):
-                    exclude_patterns.append(fname)
-                elif single_doc and fname != pattern:
-                    exclude_patterns.append(fname)
+                    exclude_patterns.append(rel_fname)
+                elif single_doc and rel_fname != pattern:
+                    exclude_patterns.append(rel_fname)
 
 with open(os.path.join(source_path, "index.rst.template")) as f:
     t = jinja2.Template(f.read())

diff --git a/doc/source/development/community.rst b/doc/source/development/community.rst
@@ -111,9 +111,11 @@ contributing to pandas. The slack is a private space, specifically meant for
 people who are hesitant to bring up their questions or ideas on a large public
 mailing list or GitHub.
 
-If this sounds like the right place for you, you are welcome to join! Email us
-at `slack@pandas.pydata.org <mailto://slack@pandas.pydata.org>`_ and let us
-know that you read and agree to our `Code of Conduct <https://pandas.pydata.org/community/coc.html>`_
-😉 to get an invite. And please remember that slack is not meant to replace the
-mailing list or issue tracker - all important announcements and conversations
-should still happen there.
+If this sounds like the right place for you, you are welcome to join using
+`this link <https://join.slack.com/t/pandas-dev-community/shared_invite/zt-1e2qgy1r6-PLCN8UOLEUAYoLdAsaJilw>`_!
+Please remember to follow our `Code of Conduct <https://pandas.pydata.org/community/coc.html>`_,
+and be aware that our admins are monitoring for irrelevant messages and will remove folks who use
+our
+slack for spam, advertisements and messages not related to the pandas contributing community. And
+please remember that slack is not meant to replace the mailing list or issue tracker - all important
+announcements and conversations should still happen there.
diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
@@ -812,7 +812,8 @@ install pandas) by typing::
     your installation is probably fine and you can start contributing!
 
 Often it is worth running only a subset of tests first around your changes before running the
-entire suite.
+entire suite (tip: you can use the [pandas-coverage app](https://pandas-coverage.herokuapp.com/)
+to find out which tests hit the lines of code you've modified, and then run only those).
 
 The easiest way to do this is with::
 

diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst
@@ -533,7 +533,7 @@ Data sets do not only contain numerical data. pandas provides a wide range of fu
 Coming from...
 --------------
 
-Are you familiar with other software for manipulating tablular data? Learn
+Are you familiar with other software for manipulating tabular data? Learn
 the pandas-equivalent operations compared to software you already know:
 
 .. panels::