-
-
Notifications
You must be signed in to change notification settings - Fork 12
Add round-trip casts between unicode and ASCIIDType #13
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
f5d9204
add missing return for error case
ngoldbaum 09a84ef
add missing error checking in get_value
ngoldbaum 38630cc
remove incorrect decref for borrowed reference
ngoldbaum f7bb0ba
add NPY_UNUSED for unused parameters to ascii_to_ascii_get_loop
ngoldbaum 9e86760
increase maximum allowed line length
ngoldbaum bc38f89
add ascii to unicode and unicode to ascii casts
ngoldbaum d73cec2
make new_asciidtype_instance take a long instead of PyObject*
ngoldbaum 23ad336
ascii <-> unicode resolve_descriptors return correct descriptors for …
ngoldbaum e4a019a
remove get_loop and fix casting safety
ngoldbaum bcbc0c7
use unsigned char in ucs4_character_is_ascii
ngoldbaum 3f21ba6
simplify ascii to unicode casting use PY_UCS types
ngoldbaum 7b88321
simplify unicode to ascii cast
ngoldbaum dae600b
don't use NPY_METH_REQUIRES_PYAPI
ngoldbaum File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
[flake8] | ||
per-file-ignores = __init__.py:F401 | ||
max-line-length = 160 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,13 +28,17 @@ ascii_to_ascii_resolve_descriptors(PyObject *NPY_UNUSED(self), | |
loop_descrs[1] = given_descrs[1]; | ||
} | ||
|
||
if (((ASCIIDTypeObject *)loop_descrs[0])->size == | ||
((ASCIIDTypeObject *)loop_descrs[1])->size) { | ||
long in_size = ((ASCIIDTypeObject *)loop_descrs[0])->size; | ||
long out_size = ((ASCIIDTypeObject *)loop_descrs[1])->size; | ||
|
||
if (in_size == out_size) { | ||
*view_offset = 0; | ||
return NPY_NO_CASTING; | ||
} | ||
|
||
return NPY_SAME_KIND_CASTING; | ||
else if (in_size > out_size) { | ||
return NPY_UNSAFE_CASTING; | ||
} | ||
return NPY_SAFE_CASTING; | ||
} | ||
|
||
static int | ||
|
@@ -72,33 +76,224 @@ ascii_to_ascii(PyArrayMethod_Context *context, char *const data[], | |
return 0; | ||
} | ||
|
||
// Resolve the descriptors used for a unicode -> ASCIIDType cast and report | ||
// how safe that cast is. | ||
// | ||
// given_descrs[0] is always reused as loop_descrs[0] (a new reference is | ||
// taken). If given_descrs[1] is NULL, a fresh ASCIIDType instance sized to | ||
// hold every input character is created; otherwise given_descrs[1] is reused | ||
// (again with a new reference). | ||
// | ||
// Returns NPY_SAFE_CASTING when the output can hold at least as many | ||
// characters as the input, NPY_UNSAFE_CASTING otherwise. self, dtypes and | ||
// view_offset are not used. | ||
static NPY_CASTING | ||
unicode_to_ascii_resolve_descriptors(PyObject *NPY_UNUSED(self), | ||
PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]), | ||
PyArray_Descr *given_descrs[2], | ||
PyArray_Descr *loop_descrs[2], | ||
npy_intp *NPY_UNUSED(view_offset)) | ||
{ | ||
Py_INCREF(given_descrs[0]); | ||
loop_descrs[0] = given_descrs[0]; | ||
// numpy stores unicode as UCS4 (4 bytes wide), so bitshift | ||
// by 2 to get the number of ASCII bytes needed | ||
long in_size = (loop_descrs[0]->elsize) >> 2; | ||
if (given_descrs[1] == NULL) { | ||
// No output descriptor requested: build one with room for in_size chars. | ||
// NOTE(review): the result is stored and later dereferenced without a | ||
// NULL check. If new_asciidtype_instance can fail (TODO confirm), this | ||
// path should release loop_descrs[0] and return an error instead. | ||
ASCIIDTypeObject *ascii_descr = new_asciidtype_instance(in_size); | ||
loop_descrs[1] = (PyArray_Descr *)ascii_descr; | ||
} | ||
else { | ||
Py_INCREF(given_descrs[1]); | ||
loop_descrs[1] = given_descrs[1]; | ||
} | ||
|
||
long out_size = ((ASCIIDTypeObject *)loop_descrs[1])->size; | ||
|
||
// Only the sizes are compared here; whether the input text is actually | ||
// ASCII is not examined in this function. | ||
if (out_size >= in_size) { | ||
return NPY_SAFE_CASTING; | ||
} | ||
|
||
return NPY_UNSAFE_CASTING; | ||
} | ||
|
||
// Strided cast loop converting UCS4 unicode elements to ASCIIDType elements. | ||
// | ||
// For each of the dimensions[0] elements, up to min(in_size, out_size) UCS4 | ||
// code points are narrowed to single bytes and any remaining output bytes | ||
// are set to '\0'. The first code point above 127 sets a TypeError and makes | ||
// the function return -1; otherwise it returns 0. | ||
// | ||
// NOTE(review): this span appears to interleave removed diff rows of the old | ||
// ascii_to_ascii_get_loop (its signature, the *out_loop assignment and the | ||
// *flags assignment) with the rows of unicode_to_ascii. Neither out_loop nor | ||
// flags is a parameter of unicode_to_ascii -- confirm against the real file. | ||
static int | ||
ascii_to_ascii_get_loop(PyArrayMethod_Context *context, int aligned, | ||
int NPY_UNUSED(move_references), | ||
const npy_intp *strides, | ||
PyArrayMethod_StridedLoop **out_loop, | ||
NpyAuxData **NPY_UNUSED(out_transferdata), | ||
NPY_ARRAYMETHOD_FLAGS *flags) | ||
unicode_to_ascii(PyArrayMethod_Context *context, char *const data[], | ||
npy_intp const dimensions[], npy_intp const strides[], | ||
NpyAuxData *NPY_UNUSED(auxdata)) | ||
{ | ||
*out_loop = (PyArrayMethod_StridedLoop *)&ascii_to_ascii; | ||
PyArray_Descr **descrs = context->descriptors; | ||
// elsize is in bytes; dividing by 4 gives the number of UCS4 characters. | ||
long in_size = (descrs[0]->elsize) / 4; | ||
long out_size = ((ASCIIDTypeObject *)descrs[1])->size; | ||
long copy_size; | ||
|
||
// copy_size = min(in_size, out_size): never read past the input element | ||
// and never write past the output element. | ||
if (out_size > in_size) { | ||
copy_size = in_size; | ||
} | ||
else { | ||
copy_size = out_size; | ||
} | ||
|
||
npy_intp N = dimensions[0]; | ||
char *in = data[0]; | ||
char *out = data[1]; | ||
npy_intp in_stride = strides[0]; | ||
npy_intp out_stride = strides[1]; | ||
|
||
while (N--) { | ||
// copy input characters, checking that input UCS4 | ||
// characters are all ascii, raising an error otherwise | ||
for (int i = 0; i < copy_size; i++) { | ||
Py_UCS4 c = ((Py_UCS4 *)in)[i]; | ||
if (c > 127) { | ||
PyErr_SetString( | ||
PyExc_TypeError, | ||
"Can only store ASCII text in a ASCIIDType array."); | ||
return -1; | ||
} | ||
// UCS4 character is ascii, so casting to Py_UCS1 does not truncate | ||
out[i] = (Py_UCS1)c; | ||
} | ||
// write zeros to remaining ASCII characters (if any) | ||
for (int i = copy_size; i < out_size; i++) { | ||
*(out + i) = '\0'; | ||
} | ||
// Advance both cursors by their byte strides to the next element. | ||
in += in_stride; | ||
out += out_stride; | ||
} | ||
|
||
*flags = 0; | ||
return 0; | ||
} | ||
|
||
// Strided cast loop converting ASCIIDType elements to UCS4 unicode elements. | ||
// | ||
// For each of the dimensions[0] elements, up to min(in_size, out_size) input | ||
// bytes are widened to one UCS4 code unit each, and any remaining output | ||
// code units are set to zero. Always returns 0. | ||
static int | ||
ascii_to_unicode(PyArrayMethod_Context *context, char *const data[], | ||
npy_intp const dimensions[], npy_intp const strides[], | ||
NpyAuxData *NPY_UNUSED(auxdata)) | ||
{ | ||
PyArray_Descr **descrs = context->descriptors; | ||
long in_size = ((ASCIIDTypeObject *)descrs[0])->size; | ||
// elsize is in bytes; dividing by 4 gives the number of UCS4 characters. | ||
long out_size = (descrs[1]->elsize) / 4; | ||
long copy_size; | ||
|
||
// copy_size = min(in_size, out_size). | ||
if (out_size > in_size) { | ||
copy_size = in_size; | ||
} | ||
else { | ||
copy_size = out_size; | ||
} | ||
|
||
npy_intp N = dimensions[0]; | ||
char *in = data[0]; | ||
char *out = data[1]; | ||
npy_intp in_stride = strides[0]; | ||
npy_intp out_stride = strides[1]; | ||
|
||
while (N--) { | ||
// widen each ASCII input byte to a full UCS4 code unit | ||
for (int i = 0; i < copy_size; i++) { | ||
((Py_UCS4 *)out)[i] = ((Py_UCS1 *)in)[i]; | ||
} | ||
// fill all remaining UCS4 characters with zeros | ||
for (int i = copy_size; i < out_size; i++) { | ||
((Py_UCS4 *)out)[i] = (Py_UCS1)0; | ||
} | ||
// Advance both cursors by their byte strides to the next element. | ||
in += in_stride; | ||
out += out_stride; | ||
} | ||
return 0; | ||
} | ||
|
||
// Resolve the descriptors used for an ASCIIDType -> unicode cast and report | ||
// how safe that cast is. | ||
// | ||
// given_descrs[0] is always reused as loop_descrs[0] (a new reference is | ||
// taken). If given_descrs[1] is NULL, a new unicode descriptor whose elsize | ||
// fits in_size UCS4 characters is created; otherwise given_descrs[1] is | ||
// reused (again with a new reference). | ||
// | ||
// Returns NPY_SAFE_CASTING when the output can hold at least as many | ||
// characters as the input, NPY_UNSAFE_CASTING otherwise. self, dtypes and | ||
// view_offset are not used. | ||
static NPY_CASTING | ||
ascii_to_unicode_resolve_descriptors(PyObject *NPY_UNUSED(self), | ||
PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]), | ||
PyArray_Descr *given_descrs[2], | ||
PyArray_Descr *loop_descrs[2], | ||
npy_intp *NPY_UNUSED(view_offset)) | ||
{ | ||
Py_INCREF(given_descrs[0]); | ||
loop_descrs[0] = given_descrs[0]; | ||
long in_size = ((ASCIIDTypeObject *)given_descrs[0])->size; | ||
if (given_descrs[1] == NULL) { | ||
// NOTE(review): unicode_descr is written through without a NULL check. | ||
// If PyArray_DescrNewFromType can fail here (TODO confirm), this path | ||
// should release loop_descrs[0] and return an error instead. | ||
PyArray_Descr *unicode_descr = PyArray_DescrNewFromType(NPY_UNICODE); | ||
// numpy stores unicode as UCS4 (4 bytes wide), so bitshift | ||
// by 2 to get the number of bytes needed to store the UCS4 characters | ||
unicode_descr->elsize = in_size << 2; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My very personal thing would be to use |
||
loop_descrs[1] = unicode_descr; | ||
} | ||
else { | ||
Py_INCREF(given_descrs[1]); | ||
loop_descrs[1] = given_descrs[1]; | ||
} | ||
|
||
// Convert the output element size from bytes back to UCS4 characters. | ||
long out_size = (loop_descrs[1]->elsize) >> 2; | ||
|
||
if (out_size >= in_size) { | ||
return NPY_SAFE_CASTING; | ||
} | ||
|
||
return NPY_UNSAFE_CASTING; | ||
} | ||
|
||
// DType pair for the ASCIIDType -> ASCIIDType cast; both entries are NULL. | ||
// NOTE(review): presumably NULL is a placeholder that is filled in with this | ||
// DType when the casts are registered -- confirm against the registration code. | ||
static PyArray_DTypeMeta *a2a_dtypes[2] = {NULL, NULL}; | ||
|
||
// Slot table for the ASCIIDType -> ASCIIDType cast, terminated by {0, NULL}. | ||
// The same ascii_to_ascii function is listed for both the aligned and the | ||
// unaligned strided loop. | ||
// NOTE(review): the _NPY_METH_get_loop row looks like a removed diff row that | ||
// is still shown next to its replacements -- confirm against the real file. | ||
static PyType_Slot a2a_slots[] = { | ||
{NPY_METH_resolve_descriptors, &ascii_to_ascii_resolve_descriptors}, | ||
{_NPY_METH_get_loop, &ascii_to_ascii_get_loop}, | ||
{NPY_METH_strided_loop, &ascii_to_ascii}, | ||
{NPY_METH_unaligned_strided_loop, &ascii_to_ascii}, | ||
{0, NULL}}; | ||
|
||
// Array-method spec for the ASCIIDType -> ASCIIDType cast: one input and one | ||
// output, using the dtypes and slots defined above. | ||
// NOTE(review): .flags and .casting each appear twice below; this looks like | ||
// the old and new diff rows shown together -- confirm which pair is live. | ||
PyArrayMethod_Spec ASCIIToASCIICastSpec = { | ||
.name = "cast_ASCIIDType_to_ASCIIDType", | ||
.nin = 1, | ||
.nout = 1, | ||
.flags = NPY_METH_SUPPORTS_UNALIGNED, | ||
.casting = NPY_SAME_KIND_CASTING, | ||
.casting = NPY_UNSAFE_CASTING, | ||
.flags = (NPY_METH_NO_FLOATINGPOINT_ERRORS | | ||
NPY_METH_SUPPORTS_UNALIGNED), | ||
.dtypes = a2a_dtypes, | ||
.slots = a2a_slots, | ||
}; | ||
|
||
// Slot table for the unicode -> ASCIIDType cast, terminated by {0, NULL}: | ||
// a descriptor resolver and a single strided loop. | ||
static PyType_Slot u2a_slots[] = { | ||
{NPY_METH_resolve_descriptors, &unicode_to_ascii_resolve_descriptors}, | ||
{NPY_METH_strided_loop, &unicode_to_ascii}, | ||
{0, NULL}}; | ||
|
||
// Name stored in the unicode -> ASCIIDType spec built by get_casts(). | ||
static char *u2a_name = "cast_Unicode_to_ASCIIDType"; | ||
|
||
// Slot table for the ASCIIDType -> unicode cast, terminated by {0, NULL}: | ||
// a descriptor resolver and a single strided loop. | ||
static PyType_Slot a2u_slots[] = { | ||
{NPY_METH_resolve_descriptors, &ascii_to_unicode_resolve_descriptors}, | ||
{NPY_METH_strided_loop, &ascii_to_unicode}, | ||
{0, NULL}}; | ||
|
||
// Name stored in the ASCIIDType -> unicode spec built by get_casts(). | ||
static char *a2u_name = "cast_ASCIIDType_to_Unicode"; | ||
|
||
// Build and return a NULL-terminated array of the three cast specs: | ||
// ASCIIDType -> ASCIIDType (the static ASCIIToASCIICastSpec), | ||
// unicode -> ASCIIDType and ASCIIDType -> unicode. | ||
// | ||
// The two unicode specs, their dtype arrays and the returned array are all | ||
// allocated with malloc and nothing is freed in this function. | ||
// NOTE(review): the caller presumably owns and releases these -- confirm. | ||
// NOTE(review): none of the malloc results is checked for NULL before being | ||
// written through. | ||
PyArrayMethod_Spec ** | ||
get_casts(void) | ||
{ | ||
// dtypes for unicode -> ASCIIDType: the source is numpy's unicode DType. | ||
// NOTE(review): the NULL target presumably stands for this DType -- confirm. | ||
PyArray_DTypeMeta **u2a_dtypes = malloc(2 * sizeof(PyArray_DTypeMeta *)); | ||
u2a_dtypes[0] = &PyArray_UnicodeDType; | ||
u2a_dtypes[1] = NULL; | ||
|
||
PyArrayMethod_Spec *UnicodeToASCIICastSpec = | ||
malloc(sizeof(PyArrayMethod_Spec)); | ||
|
||
UnicodeToASCIICastSpec->name = u2a_name; | ||
UnicodeToASCIICastSpec->nin = 1; | ||
UnicodeToASCIICastSpec->nout = 1; | ||
UnicodeToASCIICastSpec->casting = NPY_UNSAFE_CASTING; | ||
UnicodeToASCIICastSpec->flags = NPY_METH_NO_FLOATINGPOINT_ERRORS; | ||
UnicodeToASCIICastSpec->dtypes = u2a_dtypes; | ||
UnicodeToASCIICastSpec->slots = u2a_slots; | ||
|
||
// dtypes for ASCIIDType -> unicode: the mirror image of the pair above. | ||
PyArray_DTypeMeta **a2u_dtypes = malloc(2 * sizeof(PyArray_DTypeMeta *)); | ||
a2u_dtypes[0] = NULL; | ||
a2u_dtypes[1] = &PyArray_UnicodeDType; | ||
|
||
PyArrayMethod_Spec *ASCIIToUnicodeCastSpec = | ||
malloc(sizeof(PyArrayMethod_Spec)); | ||
|
||
ASCIIToUnicodeCastSpec->name = a2u_name; | ||
ASCIIToUnicodeCastSpec->nin = 1; | ||
ASCIIToUnicodeCastSpec->nout = 1; | ||
ASCIIToUnicodeCastSpec->casting = NPY_UNSAFE_CASTING; | ||
ASCIIToUnicodeCastSpec->flags = NPY_METH_NO_FLOATINGPOINT_ERRORS; | ||
ASCIIToUnicodeCastSpec->dtypes = a2u_dtypes; | ||
ASCIIToUnicodeCastSpec->slots = a2u_slots; | ||
|
||
// Three specs plus a trailing NULL that marks the end of the list. | ||
PyArrayMethod_Spec **casts = malloc(4 * sizeof(PyArrayMethod_Spec *)); | ||
casts[0] = &ASCIIToASCIICastSpec; | ||
casts[1] = UnicodeToASCIICastSpec; | ||
casts[2] = ASCIIToUnicodeCastSpec; | ||
casts[3] = NULL; | ||
|
||
return casts; | ||
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, this type of setup could of course also be done in the
get_loop
if it helps a lot. But it hardly matters in practice... (I guess it might for HPy support or so, but that is another tricky thing to figure out one day.)