Skip to content

Commit

Permalink
apacheGH-36730: [Python] Add support for Cython 3.0.0 (apache#37097)
Browse files Browse the repository at this point in the history
### Rationale for this change

Cython 3.0.0 is the latest release. PyArrow should work with Cython 3.0.0. **Cython 3 is not enabled in this diff.**

### What changes are included in this PR?

* Don't use `vector[XXX]&&`
* Add a declaration for `postincrement`
  * See also: https://cython.readthedocs.io/en/latest/src/userguide/migrating_to_cy30.html#c-postincrement-postdecrement-operator
* Ignore `C4551` warning (function call missing argument list) with MSVC
  * See also: cython/cython#4445
* Add missing `const` to `CLocation`'s static methods.
* Don't use `StopIteration` to stop generator
  * See also: https://cython.readthedocs.io/en/latest/src/userguide/migrating_to_cy30.html#python-3-syntax-semantics
* non-extern `cdef` functions will now propagate python exceptions automatically unless explicitly labeled `noexcept`
* Function binding in cython is now enabled by default. Class methods that are used as wrappers for pickling should be converted to staticmethods.
* Numpydocs now validates more Cython 3 objects than Cython <3
  * Enum types are now being validated, and some unhelpful validation checks on Enums are now ignored
* Added a cython <3 nightly CI job

Note:
* Cython 3.0.0, 3.0.1, 3.0.2 has an issue when compiling with debug mode cython/cython#5552

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.

* Closes: apache#36730

Lead-authored-by: Dane Pitkin <dane@voltrondata.com>
Co-authored-by: Sutou Kouhei <kou@clear-code.com>
Co-authored-by: Dane Pitkin <48041712+danepitkin@users.noreply.github.com>
Signed-off-by: AlenkaF <frim.alenka@gmail.com>
  • Loading branch information
2 people authored and loicalleyne committed Nov 13, 2023
1 parent 154dc35 commit 758c3c4
Show file tree
Hide file tree
Showing 13 changed files with 124 additions and 31 deletions.
24 changes: 24 additions & 0 deletions ci/docker/conda-python-cython2.dockerfile
@@ -0,0 +1,24 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

ARG repo
ARG arch
ARG python=3.8
FROM ${repo}:${arch}-conda-python-${python}

RUN mamba install -q -y "cython<3" && \
mamba clean --all
9 changes: 9 additions & 0 deletions dev/archery/archery/lang/python.py
Expand Up @@ -16,6 +16,7 @@
# under the License.

from contextlib import contextmanager
from enum import EnumMeta
import inspect
import tokenize

Expand Down Expand Up @@ -112,6 +113,10 @@ def inspect_signature(obj):


class NumpyDoc:
IGNORE_VALIDATION_ERRORS_FOR_TYPE = {
# Enum function signatures should never be documented
EnumMeta: ["PR01"]
}

def __init__(self, symbols=None):
if not have_numpydoc:
Expand Down Expand Up @@ -229,6 +234,10 @@ def callback(obj):
continue
if disallow_rules and errcode in disallow_rules:
continue
if any(isinstance(obj, obj_type) and errcode in errcode_list
for obj_type, errcode_list
in NumpyDoc.IGNORE_VALIDATION_ERRORS_FOR_TYPE.items()):
continue
errors.append((errcode, errmsg))

if len(errors):
Expand Down
8 changes: 8 additions & 0 deletions dev/tasks/tasks.yml
Expand Up @@ -1286,6 +1286,14 @@ tasks:
PYTHON: "3.10"
image: conda-python-substrait

test-conda-python-3.10-cython2:
ci: github
template: docker-tests/github.linux.yml
params:
env:
PYTHON: "3.10"
image: conda-python-cython2

test-debian-11-python-3:
ci: azure
template: docker-tests/azure.linux.yml
Expand Down
25 changes: 25 additions & 0 deletions docker-compose.yml
Expand Up @@ -119,6 +119,7 @@ x-hierarchy:
- conda-python:
- conda-python-pandas:
- conda-python-docs
- conda-python-cython2
- conda-python-dask
- conda-python-hdfs
- conda-python-java-integration
Expand Down Expand Up @@ -1349,6 +1350,30 @@ services:
/arrow/ci/scripts/java_build.sh /arrow /build /tmp/dist/java &&
/arrow/ci/scripts/java_cdata_integration.sh /arrow /tmp/dist/java" ]

conda-python-cython2:
# Usage:
# docker-compose build conda
# docker-compose build conda-cpp
# docker-compose build conda-python
# docker-compose build conda-python-cython2
# docker-compose run --rm conda-python-cython2
image: ${REPO}:${ARCH}-conda-python-${PYTHON}-cython2
build:
context: .
dockerfile: ci/docker/conda-python-cython2.dockerfile
cache_from:
- ${REPO}:${ARCH}-conda-python-${PYTHON}-cython2
args:
repo: ${REPO}
arch: ${ARCH}
python: ${PYTHON}
shm_size: *shm-size
environment:
<<: [*common, *ccache]
PYTEST_ARGS: # inherit
volumes: *conda-volumes
command: *python-conda-command

################################## R ########################################

ubuntu-r:
Expand Down
29 changes: 18 additions & 11 deletions python/CMakeLists.txt
Expand Up @@ -168,37 +168,44 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PYARROW_CXXFLAGS}")

if(MSVC)
# MSVC version of -Wno-return-type-c-linkage
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4190")
string(APPEND CMAKE_CXX_FLAGS " /wd4190")

# Cython generates some bitshift expressions that MSVC does not like in
# __Pyx_PyFloat_DivideObjC
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4293")
string(APPEND CMAKE_CXX_FLAGS " /wd4293")

# Converting to/from C++ bool is pretty wonky in Cython. The C4800 warning
# seem harmless, and probably not worth the effort of working around it
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4800")
string(APPEND CMAKE_CXX_FLAGS " /wd4800")

# See https://github.com/cython/cython/issues/2731. Change introduced in
# Cython 0.29.1 causes "unsafe use of type 'bool' in operation"
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4804")
string(APPEND CMAKE_CXX_FLAGS " /wd4804")

# See https://github.com/cython/cython/issues/4445.
#
# Cython 3 emits "(void)__Pyx_PyObject_CallMethod0;" to suppress a
# "unused function" warning but the code emits another "function
# call missing argument list" warning.
string(APPEND CMAKE_CXX_FLAGS " /wd4551")
else()
# Enable perf and other tools to work properly
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
string(APPEND CMAKE_CXX_FLAGS " -fno-omit-frame-pointer")

# Suppress Cython warnings
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-variable -Wno-maybe-uninitialized")
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-variable -Wno-maybe-uninitialized")

if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL
"Clang")
# Cython warnings in clang
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-constant-logical-operand")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-declarations")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sometimes-uninitialized")
string(APPEND CMAKE_CXX_FLAGS " -Wno-parentheses-equality")
string(APPEND CMAKE_CXX_FLAGS " -Wno-constant-logical-operand")
string(APPEND CMAKE_CXX_FLAGS " -Wno-missing-declarations")
string(APPEND CMAKE_CXX_FLAGS " -Wno-sometimes-uninitialized")

# We have public Cython APIs which return C++ types, which are in an extern
# "C" blog (no symbol mangling) and clang doesn't like this
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-return-type-c-linkage")
string(APPEND CMAKE_CXX_FLAGS " -Wno-return-type-c-linkage")
endif()
endif()

Expand Down
3 changes: 2 additions & 1 deletion python/pyarrow/_dataset.pyx
Expand Up @@ -1078,7 +1078,8 @@ cdef class FileSystemDataset(Dataset):
@classmethod
def from_paths(cls, paths, schema=None, format=None,
filesystem=None, partitions=None, root_partition=None):
"""A Dataset created from a list of paths on a particular filesystem.
"""
A Dataset created from a list of paths on a particular filesystem.
Parameters
----------
Expand Down
10 changes: 7 additions & 3 deletions python/pyarrow/_flight.pyx
Expand Up @@ -988,8 +988,10 @@ cdef class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin):
cdef shared_ptr[CMetadataRecordBatchReader] reader

def __iter__(self):
while True:
yield self.read_chunk()
return self

def __next__(self):
return self.read_chunk()

@property
def schema(self):
Expand Down Expand Up @@ -1699,7 +1701,9 @@ cdef class FlightClient(_Weakrefable):

def close(self):
"""Close the client and disconnect."""
check_flight_status(self.client.get().Close())
client = self.client.get()
if client != NULL:
check_flight_status(client.Close())

def __del__(self):
# Not ideal, but close() wasn't originally present so
Expand Down
3 changes: 2 additions & 1 deletion python/pyarrow/_substrait.pyx
Expand Up @@ -27,9 +27,10 @@ from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_substrait cimport *


# TODO GH-37235: Fix exception handling
cdef CDeclaration _create_named_table_provider(
dict named_args, const std_vector[c_string]& names, const CSchema& schema
):
) noexcept:
cdef:
c_string c_name
shared_ptr[CTable] c_in_table
Expand Down
15 changes: 10 additions & 5 deletions python/pyarrow/includes/libarrow_flight.pxd
Expand Up @@ -118,16 +118,16 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil:
c_bool Equals(const CLocation& other)

@staticmethod
CResult[CLocation] Parse(c_string& uri_string)
CResult[CLocation] Parse(const c_string& uri_string)

@staticmethod
CResult[CLocation] ForGrpcTcp(c_string& host, int port)
CResult[CLocation] ForGrpcTcp(const c_string& host, int port)

@staticmethod
CResult[CLocation] ForGrpcTls(c_string& host, int port)
CResult[CLocation] ForGrpcTls(const c_string& host, int port)

@staticmethod
CResult[CLocation] ForGrpcUnix(c_string& path)
CResult[CLocation] ForGrpcUnix(const c_string& path)

cdef cppclass CFlightEndpoint" arrow::flight::FlightEndpoint":
CFlightEndpoint()
Expand Down Expand Up @@ -172,7 +172,9 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil:
CResult[unique_ptr[CFlightInfo]] Next()

cdef cppclass CSimpleFlightListing" arrow::flight::SimpleFlightListing":
CSimpleFlightListing(vector[CFlightInfo]&& info)
# This doesn't work with Cython >= 3
# CSimpleFlightListing(vector[CFlightInfo]&& info)
CSimpleFlightListing(const vector[CFlightInfo]& info)

cdef cppclass CFlightPayload" arrow::flight::FlightPayload":
shared_ptr[CBuffer] descriptor
Expand Down Expand Up @@ -310,7 +312,10 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil:
cdef cppclass CCallHeaders" arrow::flight::CallHeaders":
cppclass const_iterator:
pair[c_string, c_string] operator*()
# For Cython < 3
const_iterator operator++()
# For Cython >= 3
const_iterator operator++(int)
bint operator==(const_iterator)
bint operator!=(const_iterator)
const_iterator cbegin()
Expand Down
15 changes: 8 additions & 7 deletions python/pyarrow/ipc.pxi
Expand Up @@ -436,8 +436,10 @@ cdef class MessageReader(_Weakrefable):
return result

def __iter__(self):
while True:
yield self.read_next_message()
return self

def __next__(self):
return self.read_next_message()

def read_next_message(self):
"""
Expand Down Expand Up @@ -656,11 +658,10 @@ cdef class RecordBatchReader(_Weakrefable):
# cdef block is in lib.pxd

def __iter__(self):
while True:
try:
yield self.read_next_batch()
except StopIteration:
return
return self

def __next__(self):
return self.read_next_batch()

@property
def schema(self):
Expand Down
4 changes: 2 additions & 2 deletions python/pyarrow/scalar.pxi
Expand Up @@ -819,8 +819,8 @@ cdef class MapScalar(ListScalar):
Iterate over this element's values.
"""
arr = self.values
if array is None:
raise StopIteration
if arr is None:
return
for k, v in zip(arr.field(self.type.key_field.name), arr.field(self.type.item_field.name)):
yield (k.as_py(), v.as_py())

Expand Down
6 changes: 5 additions & 1 deletion python/pyarrow/tests/test_dataset.py
Expand Up @@ -1615,9 +1615,13 @@ def test_fragments_repr(tempdir, dataset):
# partitioned parquet dataset
fragment = list(dataset.get_fragments())[0]
assert (
# Ordering of partition items is non-deterministic
repr(fragment) ==
"<pyarrow.dataset.ParquetFileFragment path=subdir/1/xxx/file0.parquet "
"partition=[key=xxx, group=1]>"
"partition=[key=xxx, group=1]>" or
repr(fragment) ==
"<pyarrow.dataset.ParquetFileFragment path=subdir/1/xxx/file0.parquet "
"partition=[group=1, key=xxx]>"
)

# single-file parquet dataset (no partition information in repr)
Expand Down
4 changes: 4 additions & 0 deletions python/pyarrow/tests/test_scalars.py
Expand Up @@ -700,6 +700,10 @@ def test_map(pickle_module):
for i, j in zip(s, v):
assert i == j

# test iteration with missing values
for _ in pa.scalar(None, type=ty):
pass

assert s.as_py() == v
assert s[1] == (
pa.scalar('b', type=pa.string()),
Expand Down

0 comments on commit 758c3c4

Please sign in to comment.