From 758c3c4d8bce8cae9012ac7cd46e14c38be96c78 Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:06:38 -0400 Subject: [PATCH] GH-36730: [Python] Add support for Cython 3.0.0 (#37097) ### Rationale for this change Cython 3.0.0 is the latest release. PyArrow should work with Cython 3.0.0. **Cython 3 is not enabled in this diff.** ### What changes are included in this PR? * Don't use `vector[XXX]&&` * Add a declaration for `postincrement` * See also: https://cython.readthedocs.io/en/latest/src/userguide/migrating_to_cy30.html#c-postincrement-postdecrement-operator * Ignore `C4551` warning (function call missing argument list) with MSVC * See also: https://github.com/cython/cython/issues/4445 * Add missing `const` to `CLocation`'s static methods. * Don't use `StopIteration` to stop generator * See also: https://cython.readthedocs.io/en/latest/src/userguide/migrating_to_cy30.html#python-3-syntax-semantics * non-extern `cdef` functions will now propagate python exceptions automatically unless explicitly labeled `noexcept` * Function binding in cython is now enabled by default. Class methods that are used as wrappers for pickling should be converted to staticmethods. * Numpydocs now validates more Cython 3 objects than Cython <3 * Enum types are now being validated, and some unhelpful validation checks on Enums are now ignored * Added a cython <3 nightly CI job Note: * Cython 3.0.0, 3.0.1, 3.0.2 has an issue when compiling with debug mode https://github.com/cython/cython/issues/5552 ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * Closes: #36730 Lead-authored-by: Dane Pitkin Co-authored-by: Sutou Kouhei Co-authored-by: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Signed-off-by: AlenkaF --- ci/docker/conda-python-cython2.dockerfile | 24 +++++++++++++++++ dev/archery/archery/lang/python.py | 9 +++++++ dev/tasks/tasks.yml | 8 ++++++ docker-compose.yml | 25 ++++++++++++++++++ python/CMakeLists.txt | 29 +++++++++++++-------- python/pyarrow/_dataset.pyx | 3 ++- python/pyarrow/_flight.pyx | 10 ++++--- python/pyarrow/_substrait.pyx | 3 ++- python/pyarrow/includes/libarrow_flight.pxd | 15 +++++++---- python/pyarrow/ipc.pxi | 15 ++++++----- python/pyarrow/scalar.pxi | 4 +-- python/pyarrow/tests/test_dataset.py | 6 ++++- python/pyarrow/tests/test_scalars.py | 4 +++ 13 files changed, 124 insertions(+), 31 deletions(-) create mode 100644 ci/docker/conda-python-cython2.dockerfile diff --git a/ci/docker/conda-python-cython2.dockerfile b/ci/docker/conda-python-cython2.dockerfile new file mode 100644 index 0000000000000..d67ef677276c7 --- /dev/null +++ b/ci/docker/conda-python-cython2.dockerfile @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG repo +ARG arch +ARG python=3.8 +FROM ${repo}:${arch}-conda-python-${python} + +RUN mamba install -q -y "cython<3" && \ + mamba clean --all diff --git a/dev/archery/archery/lang/python.py b/dev/archery/archery/lang/python.py index 8600a0d7c48c0..d4c1853d097b2 100644 --- a/dev/archery/archery/lang/python.py +++ b/dev/archery/archery/lang/python.py @@ -16,6 +16,7 @@ # under the License. from contextlib import contextmanager +from enum import EnumMeta import inspect import tokenize @@ -112,6 +113,10 @@ def inspect_signature(obj): class NumpyDoc: + IGNORE_VALIDATION_ERRORS_FOR_TYPE = { + # Enum function signatures should never be documented + EnumMeta: ["PR01"] + } def __init__(self, symbols=None): if not have_numpydoc: @@ -229,6 +234,10 @@ def callback(obj): continue if disallow_rules and errcode in disallow_rules: continue + if any(isinstance(obj, obj_type) and errcode in errcode_list + for obj_type, errcode_list + in NumpyDoc.IGNORE_VALIDATION_ERRORS_FOR_TYPE.items()): + continue errors.append((errcode, errmsg)) if len(errors): diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index ed238778635d3..29e038a922412 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1286,6 +1286,14 @@ tasks: PYTHON: "3.10" image: conda-python-substrait + test-conda-python-3.10-cython2: + ci: github + template: docker-tests/github.linux.yml + params: + env: + PYTHON: "3.10" + image: conda-python-cython2 + test-debian-11-python-3: ci: azure template: docker-tests/azure.linux.yml diff --git a/docker-compose.yml b/docker-compose.yml index a79b13c0a5f91..8ae06900c57f9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -119,6 +119,7 @@ x-hierarchy: - conda-python: - conda-python-pandas: - conda-python-docs + - conda-python-cython2 - conda-python-dask - conda-python-hdfs - conda-python-java-integration @@ -1349,6 +1350,30 @@ services: /arrow/ci/scripts/java_build.sh /arrow /build /tmp/dist/java && /arrow/ci/scripts/java_cdata_integration.sh /arrow /tmp/dist/java" ] + conda-python-cython2: + # Usage: + # docker-compose build conda + # docker-compose build conda-cpp + # docker-compose build conda-python + # docker-compose build conda-python-cython2 + # docker-compose run --rm conda-python-cython2 + image: ${REPO}:${ARCH}-conda-python-${PYTHON}-cython2 + build: + context: . + dockerfile: ci/docker/conda-python-cython2.dockerfile + cache_from: + - ${REPO}:${ARCH}-conda-python-${PYTHON}-cython2 + args: + repo: ${REPO} + arch: ${ARCH} + python: ${PYTHON} + shm_size: *shm-size + environment: + <<: [*common, *ccache] + PYTEST_ARGS: # inherit + volumes: *conda-volumes + command: *python-conda-command + ################################## R ######################################## ubuntu-r: diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 242ba8448f4a6..29f8d2da72f3a 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -168,37 +168,44 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PYARROW_CXXFLAGS}") if(MSVC) # MSVC version of -Wno-return-type-c-linkage - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4190") + string(APPEND CMAKE_CXX_FLAGS " /wd4190") # Cython generates some bitshift expressions that MSVC does not like in # __Pyx_PyFloat_DivideObjC - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4293") + string(APPEND CMAKE_CXX_FLAGS " /wd4293") # Converting to/from C++ bool is pretty wonky in Cython. The C4800 warning # seem harmless, and probably not worth the effort of working around it - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4800") + string(APPEND CMAKE_CXX_FLAGS " /wd4800") # See https://github.com/cython/cython/issues/2731. Change introduced in # Cython 0.29.1 causes "unsafe use of type 'bool' in operation" - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4804") + string(APPEND CMAKE_CXX_FLAGS " /wd4804") + + # See https://github.com/cython/cython/issues/4445. + # + # Cython 3 emits "(void)__Pyx_PyObject_CallMethod0;" to suppress a + # "unused function" warning but the code emits another "function + # call missing argument list" warning. + string(APPEND CMAKE_CXX_FLAGS " /wd4551") else() # Enable perf and other tools to work properly - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer") + string(APPEND CMAKE_CXX_FLAGS " -fno-omit-frame-pointer") # Suppress Cython warnings - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-variable -Wno-maybe-uninitialized") + string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-variable -Wno-maybe-uninitialized") if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") # Cython warnings in clang - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-constant-logical-operand") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-declarations") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sometimes-uninitialized") + string(APPEND CMAKE_CXX_FLAGS " -Wno-parentheses-equality") + string(APPEND CMAKE_CXX_FLAGS " -Wno-constant-logical-operand") + string(APPEND CMAKE_CXX_FLAGS " -Wno-missing-declarations") + string(APPEND CMAKE_CXX_FLAGS " -Wno-sometimes-uninitialized") # We have public Cython APIs which return C++ types, which are in an extern # "C" blog (no symbol mangling) and clang doesn't like this - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-return-type-c-linkage") + string(APPEND CMAKE_CXX_FLAGS " -Wno-return-type-c-linkage") endif() endif() diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index d29fa125e2061..48ee676915311 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1078,7 +1078,8 @@ cdef class FileSystemDataset(Dataset): @classmethod def from_paths(cls, paths, schema=None, format=None, filesystem=None, partitions=None, root_partition=None): - """A Dataset created from a list of paths on a particular filesystem. + """ + A Dataset created from a list of paths on a particular filesystem. Parameters ---------- diff --git a/python/pyarrow/_flight.pyx b/python/pyarrow/_flight.pyx index 42b221ed72a1b..79aa24e4ce8e3 100644 --- a/python/pyarrow/_flight.pyx +++ b/python/pyarrow/_flight.pyx @@ -988,8 +988,10 @@ cdef class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): cdef shared_ptr[CMetadataRecordBatchReader] reader def __iter__(self): - while True: - yield self.read_chunk() + return self + + def __next__(self): + return self.read_chunk() @property def schema(self): @@ -1699,7 +1701,9 @@ cdef class FlightClient(_Weakrefable): def close(self): """Close the client and disconnect.""" - check_flight_status(self.client.get().Close()) + client = self.client.get() + if client != NULL: + check_flight_status(client.Close()) def __del__(self): # Not ideal, but close() wasn't originally present so diff --git a/python/pyarrow/_substrait.pyx b/python/pyarrow/_substrait.pyx index 4efad2c4d1bc5..067cb5f91681b 100644 --- a/python/pyarrow/_substrait.pyx +++ b/python/pyarrow/_substrait.pyx @@ -27,9 +27,10 @@ from pyarrow.includes.libarrow cimport * from pyarrow.includes.libarrow_substrait cimport * +# TODO GH-37235: Fix exception handling cdef CDeclaration _create_named_table_provider( dict named_args, const std_vector[c_string]& names, const CSchema& schema -): +) noexcept: cdef: c_string c_name shared_ptr[CTable] c_in_table diff --git a/python/pyarrow/includes/libarrow_flight.pxd b/python/pyarrow/includes/libarrow_flight.pxd index 4bddd2d080f5f..c4cf5830c4128 100644 --- a/python/pyarrow/includes/libarrow_flight.pxd +++ b/python/pyarrow/includes/libarrow_flight.pxd @@ -118,16 +118,16 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil: c_bool Equals(const CLocation& other) @staticmethod - CResult[CLocation] Parse(c_string& uri_string) + CResult[CLocation] Parse(const c_string& uri_string) @staticmethod - CResult[CLocation] ForGrpcTcp(c_string& host, int port) + CResult[CLocation] ForGrpcTcp(const c_string& host, int port) @staticmethod - CResult[CLocation] ForGrpcTls(c_string& host, int port) + CResult[CLocation] ForGrpcTls(const c_string& host, int port) @staticmethod - CResult[CLocation] ForGrpcUnix(c_string& path) + CResult[CLocation] ForGrpcUnix(const c_string& path) cdef cppclass CFlightEndpoint" arrow::flight::FlightEndpoint": CFlightEndpoint() @@ -172,7 +172,9 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil: CResult[unique_ptr[CFlightInfo]] Next() cdef cppclass CSimpleFlightListing" arrow::flight::SimpleFlightListing": - CSimpleFlightListing(vector[CFlightInfo]&& info) + # This doesn't work with Cython >= 3 + # CSimpleFlightListing(vector[CFlightInfo]&& info) + CSimpleFlightListing(const vector[CFlightInfo]& info) cdef cppclass CFlightPayload" arrow::flight::FlightPayload": shared_ptr[CBuffer] descriptor @@ -310,7 +312,10 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil: cdef cppclass CCallHeaders" arrow::flight::CallHeaders": cppclass const_iterator: pair[c_string, c_string] operator*() + # For Cython < 3 const_iterator operator++() + # For Cython >= 3 + const_iterator operator++(int) bint operator==(const_iterator) bint operator!=(const_iterator) const_iterator cbegin() diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index a8398597fe6cd..53e521fc11468 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -436,8 +436,10 @@ cdef class MessageReader(_Weakrefable): return result def __iter__(self): - while True: - yield self.read_next_message() + return self + + def __next__(self): + return self.read_next_message() def read_next_message(self): """ @@ -656,11 +658,10 @@ cdef class RecordBatchReader(_Weakrefable): # cdef block is in lib.pxd def __iter__(self): - while True: - try: - yield self.read_next_batch() - except StopIteration: - return + return self + + def __next__(self): + return self.read_next_batch() @property def schema(self): diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index e07949c675524..9a66dc81226d4 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -819,8 +819,8 @@ cdef class MapScalar(ListScalar): Iterate over this element's values. """ arr = self.values - if array is None: - raise StopIteration + if arr is None: + return for k, v in zip(arr.field(self.type.key_field.name), arr.field(self.type.item_field.name)): yield (k.as_py(), v.as_py()) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index e0988f2752033..39c3c43daea37 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -1615,9 +1615,13 @@ def test_fragments_repr(tempdir, dataset): # partitioned parquet dataset fragment = list(dataset.get_fragments())[0] assert ( + # Ordering of partition items is non-deterministic repr(fragment) == "" + "partition=[key=xxx, group=1]>" or + repr(fragment) == + "" ) # single-file parquet dataset (no partition information in repr) diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 5f6c8c813f12a..8a1dcfb057f74 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -700,6 +700,10 @@ def test_map(pickle_module): for i, j in zip(s, v): assert i == j + # test iteration with missing values + for _ in pa.scalar(None, type=ty): + pass + assert s.as_py() == v assert s[1] == ( pa.scalar('b', type=pa.string()),