Skip to content

Better naming of public symbols and small string optimization #86

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Sep 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
519 changes: 234 additions & 285 deletions stringdtype/stringdtype/src/casts.c

Large diffs are not rendered by default.

122 changes: 84 additions & 38 deletions stringdtype/stringdtype/src/dtype.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,30 @@ new_stringdtype_instance(PyObject *na_object, int coerce)

Py_XINCREF(na_object);
((StringDTypeObject *)new)->na_object = na_object;
npy_packed_static_string packed_na_name = *NPY_EMPTY_STRING;
npy_packed_static_string packed_default_string = *NPY_EMPTY_STRING;
int hasnull = na_object != NULL;
int has_nan_na = 0;
int has_string_na = 0;
ss default_string = EMPTY_STRING;
if (hasnull) {
// first check for a string
if (PyUnicode_Check(na_object)) {
has_string_na = 1;
Py_ssize_t size = 0;
const char *buf = PyUnicode_AsUTF8AndSize(na_object, &size);
default_string.len = size;
// discards const, how to avoid?
default_string.buf = (char *)buf;
int res = npy_string_newsize(buf, (size_t)size,
&packed_default_string);
if (res == -1) {
PyErr_NoMemory();
Py_DECREF(new);
return NULL;
}
else if (res == -2) {
// this should never happen
assert(0);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like this can happen if users provide nonsense for the na_object?!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe a helper would make sense, if we have the two different return values? Or pass int nogil and set the error already in npy_string_newsize?
(since more than MemoryError is possible here, unless we check the size for validity first.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like this can happen if users provide nonsense for the na_object?!

It would have to be a nonsense string that passes PyUnicode_Check()

You're right that the -2 return value here is a an API wart. I added it as a way for me to catch programming errors. Maybe I should replace that return case with a debug abort inside the static string library and make it so this only fails if you run out of memory?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, true. The user would have to try very hard... In principle tha twould mean we have to hceck the size up-front?
TBH, it is also just a minor wart if we raise a MemoryError rather than a size error in practice.

So OK either way, I though passing nogil and setting the error inside might be a way, but not sure it is much better.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to fix this in another PR but will get to it.

Py_DECREF(new);
return NULL;
}
}
else {
// treat as nan-like if != comparison returns a object whose truth
Expand All @@ -53,15 +64,50 @@ new_stringdtype_instance(PyObject *na_object, int coerce)
}
Py_DECREF(eq);
}
PyObject *na_pystr = PyObject_Str(na_object);
if (na_pystr == NULL) {
Py_DECREF(new);
return NULL;
}

Py_ssize_t size = 0;
const char *utf8_ptr = PyUnicode_AsUTF8AndSize(na_pystr, &size);
// discard const to initialize buffer
int res = npy_string_newsize(utf8_ptr, (size_t)size, &packed_na_name);
if (res == -1) {
PyErr_NoMemory();
Py_DECREF(new);
return NULL;
}
else if (res == -2) {
// this should never happen
assert(0);
Py_DECREF(new);
return NULL;
}
Py_DECREF(na_pystr);
}
((StringDTypeObject *)new)->has_nan_na = has_nan_na;
((StringDTypeObject *)new)->has_string_na = has_string_na;
((StringDTypeObject *)new)->default_string = default_string;
((StringDTypeObject *)new)->coerce = coerce;

StringDTypeObject *snew = (StringDTypeObject *)new;

snew->has_nan_na = has_nan_na;
snew->has_string_na = has_string_na;
snew->packed_default_string = packed_default_string;
snew->packed_na_name = packed_na_name;
snew->coerce = coerce;

npy_static_string default_string = {0, NULL};
npy_load_string(&snew->packed_default_string, &default_string);

npy_static_string na_name = {0, NULL};
npy_load_string(&snew->packed_na_name, &na_name);

snew->na_name = na_name;
snew->default_string = default_string;

PyArray_Descr *base = (PyArray_Descr *)new;
base->elsize = sizeof(ss);
base->alignment = _Alignof(ss);
base->elsize = sizeof(npy_static_string);
base->alignment = _Alignof(npy_static_string);
base->flags |= NPY_NEEDS_INIT;
base->flags |= NPY_LIST_PICKLE;
base->flags |= NPY_ITEM_REFCOUNT;
Expand Down Expand Up @@ -161,20 +207,19 @@ string_discover_descriptor_from_pyobject(PyTypeObject *NPY_UNUSED(cls),
int
stringdtype_setitem(StringDTypeObject *descr, PyObject *obj, char **dataptr)
{
ss *sdata = (ss *)dataptr;
npy_packed_static_string *sdata = (npy_packed_static_string *)dataptr;

// free if dataptr holds preexisting string data,
// ssfree does a NULL check
ssfree(sdata);
// npy_string_free does a NULL check and checks for small strings
npy_string_free(sdata);

// borrow reference
PyObject *na_object = descr->na_object;

// setting NA *must* check pointer equality since NA types might not
// allow equality
if (na_object != NULL && obj == na_object) {
// do nothing, ssfree already NULLed the struct ssdata points to
// so it already contains a NA value
*sdata = *NPY_NULL_STRING;
}
else {
PyObject *val_obj = get_value(obj, descr->coerce);
Expand All @@ -190,8 +235,7 @@ stringdtype_setitem(StringDTypeObject *descr, PyObject *obj, char **dataptr)
return -1;
}

// copies contents of val into item_val->buf
int res = ssnewlen(val, length, sdata);
int res = npy_string_newsize(val, length, sdata);

if (res == -1) {
PyErr_NoMemory();
Expand All @@ -213,10 +257,11 @@ static PyObject *
stringdtype_getitem(StringDTypeObject *descr, char **dataptr)
{
PyObject *val_obj = NULL;
ss *sdata = (ss *)dataptr;
npy_packed_static_string *psdata = (npy_packed_static_string *)dataptr;
npy_static_string sdata = {0, NULL};
int hasnull = descr->na_object != NULL;

if (ss_isnull(sdata)) {
if (npy_load_string(psdata, &sdata)) {
if (hasnull) {
PyObject *na_object = descr->na_object;
Py_INCREF(na_object);
Expand All @@ -227,9 +272,7 @@ stringdtype_getitem(StringDTypeObject *descr, char **dataptr)
}
}
else {
char *data = sdata->buf;
size_t len = sdata->len;
val_obj = PyUnicode_FromStringAndSize(data, len);
val_obj = PyUnicode_FromStringAndSize(sdata.buf, sdata.size);
if (val_obj == NULL) {
return NULL;
}
Expand All @@ -254,7 +297,7 @@ stringdtype_getitem(StringDTypeObject *descr, char **dataptr)
npy_bool
nonzero(void *data, void *NPY_UNUSED(arr))
{
return ((ss *)data)->len != 0;
return npy_string_size((npy_packed_static_string *)data) != 0;
}

// Implementation of PyArray_CompareFunc.
Expand All @@ -278,11 +321,13 @@ _compare(void *a, void *b, StringDTypeObject *descr)
return 0;
}
}
const ss *default_string = &descr->default_string;
const ss *ss_a = (ss *)a;
const ss *ss_b = (ss *)b;
int a_is_null = ss_isnull(ss_a);
int b_is_null = ss_isnull(ss_b);
npy_static_string *default_string = &descr->default_string;
const npy_packed_static_string *ps_a = (npy_packed_static_string *)a;
npy_static_string s_a = {0, NULL};
int a_is_null = npy_load_string(ps_a, &s_a);
const npy_packed_static_string *ps_b = (npy_packed_static_string *)b;
npy_static_string s_b = {0, NULL};
int b_is_null = npy_load_string(ps_b, &s_b);
if (NPY_UNLIKELY(a_is_null || b_is_null)) {
if (hasnull && !has_string_na) {
if (has_nan_na) {
Expand All @@ -303,22 +348,22 @@ _compare(void *a, void *b, StringDTypeObject *descr)
}
else {
if (a_is_null) {
ss_a = default_string;
s_a = *default_string;
}
if (b_is_null) {
ss_b = default_string;
s_b = *default_string;
}
}
}
return sscmp(ss_a, ss_b);
return npy_string_cmp(&s_a, &s_b);
}

// PyArray_ArgFunc
// The max element is the one with the highest unicode code point.
int
argmax(void *data, npy_intp n, npy_intp *max_ind, void *arr)
{
ss *dptr = (ss *)data;
npy_packed_static_string *dptr = (npy_packed_static_string *)data;
*max_ind = 0;
for (int i = 1; i < n; i++) {
if (compare(&dptr[i], &dptr[*max_ind], arr) > 0) {
Expand All @@ -333,7 +378,7 @@ argmax(void *data, npy_intp n, npy_intp *max_ind, void *arr)
int
argmin(void *data, npy_intp n, npy_intp *min_ind, void *arr)
{
ss *dptr = (ss *)data;
npy_packed_static_string *dptr = (npy_packed_static_string *)data;
*min_ind = 0;
for (int i = 1; i < n; i++) {
if (compare(&dptr[i], &dptr[*min_ind], arr) < 0) {
Expand All @@ -358,8 +403,8 @@ stringdtype_clear_loop(void *NPY_UNUSED(traverse_context),
{
while (size--) {
if (data != NULL) {
ssfree((ss *)data);
memset(data, 0, sizeof(ss));
npy_string_free((npy_packed_static_string *)data);
memset(data, 0, sizeof(npy_packed_static_string));
}
data += stride;
}
Expand Down Expand Up @@ -388,9 +433,7 @@ stringdtype_fill_zero_loop(void *NPY_UNUSED(traverse_context),
NpyAuxData *NPY_UNUSED(auxdata))
{
while (size--) {
if (ssnewlen("", 0, (ss *)(data)) < 0) {
return -1;
}
*(npy_packed_static_string *)(data) = *NPY_EMPTY_STRING;
data += stride;
}
return 0;
Expand Down Expand Up @@ -538,6 +581,9 @@ stringdtype_new(PyTypeObject *NPY_UNUSED(cls), PyObject *args, PyObject *kwds)
static void
stringdtype_dealloc(StringDTypeObject *self)
{
Py_XDECREF(self->na_object);
npy_string_free(&self->packed_default_string);
npy_string_free(&self->packed_na_name);
PyArrayDescr_Type.tp_dealloc((PyObject *)self);
}

Expand Down
5 changes: 4 additions & 1 deletion stringdtype/stringdtype/src/dtype.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ typedef struct {
int coerce;
int has_nan_na;
int has_string_na;
ss default_string;
npy_static_string default_string;
npy_packed_static_string packed_default_string;
npy_static_string na_name;
npy_packed_static_string packed_na_name;
} StringDTypeObject;

typedef struct {
Expand Down
9 changes: 6 additions & 3 deletions stringdtype/stringdtype/src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,11 @@ _memory_usage(PyObject *NPY_UNUSED(self), PyObject *obj)
npy_intp count = *innersizeptr;

while (count--) {
// +1 byte for the null terminator
memory_usage += ((ss *)in)->len + 1;
size_t size = npy_string_size(((npy_packed_static_string *)in));
// FIXME: add a way for a string to report its heap size usage
if (size > (sizeof(npy_static_string) - 1)) {
memory_usage += size;
}
in += stride;
}

Expand All @@ -75,7 +78,7 @@ _memory_usage(PyObject *NPY_UNUSED(self), PyObject *obj)
static PyMethodDef string_methods[] = {
{"_memory_usage", _memory_usage, METH_O,
"get memory usage for an array"},
{NULL},
{NULL, NULL, 0, NULL},
};

static struct PyModuleDef moduledef = {
Expand Down
Loading