diff --git a/pandas/_libs/src/datetime/date_conversions.c b/pandas/_libs/src/datetime/date_conversions.c deleted file mode 100644 index 99081746b2c97..0000000000000 --- a/pandas/_libs/src/datetime/date_conversions.c +++ /dev/null @@ -1,96 +0,0 @@ -/* -Copyright (c) 2020, PyData Development Team -All rights reserved. -Distributed under the terms of the BSD Simplified License. -The full license is in the LICENSE file, distributed with this software. -*/ - -// Conversion routines that are useful for serialization, -// but which don't interact with JSON objects directly - -#include "pandas/datetime/date_conversions.h" -#include "pandas/vendored/numpy/datetime/np_datetime.h" -#include "pandas/vendored/numpy/datetime/np_datetime_strings.h" - -/* - * Function: scaleNanosecToUnit - * ----------------------------- - * - * Scales an integer value representing time in nanoseconds to provided unit. - * - * Mutates the provided value directly. Returns 0 on success, non-zero on error. - */ -int scaleNanosecToUnit(int64_t *value, NPY_DATETIMEUNIT unit) { - switch (unit) { - case NPY_FR_ns: - break; - case NPY_FR_us: - *value /= 1000LL; - break; - case NPY_FR_ms: - *value /= 1000000LL; - break; - case NPY_FR_s: - *value /= 1000000000LL; - break; - default: - return -1; - } - - return 0; -} - -/* Converts the int64_t representation of a datetime to ISO; mutates len */ -char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit, - NPY_DATETIMEUNIT base, size_t *len) { - npy_datetimestruct dts; - int ret_code; - - pandas_datetime_to_datetimestruct(value, valueUnit, &dts); - - *len = (size_t)get_datetime_iso_8601_strlen(0, base); - char *result = PyObject_Malloc(*len); - - if (result == NULL) { - PyErr_NoMemory(); - return NULL; - } - // datetime64 is always naive - ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base); - if (ret_code != 0) { - PyErr_SetString(PyExc_ValueError, - "Could not convert datetime value to string"); - PyObject_Free(result); - } - - // Note that get_datetime_iso_8601_strlen just gives a generic size - // for ISO string conversion, not the actual size used - *len = strlen(result); - return result; -} - -/* Converts the int64_t representation of a duration to ISO; mutates len */ -char *int64ToIsoDuration(int64_t value, size_t *len) { - pandas_timedeltastruct tds; - int ret_code; - - pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); - - // Max theoretical length of ISO Duration with 64 bit day - // as the largest unit is 70 characters + 1 for a null terminator - char *result = PyObject_Malloc(71); - if (result == NULL) { - PyErr_NoMemory(); - return NULL; - } - - ret_code = make_iso_8601_timedelta(&tds, result, len); - if (ret_code == -1) { - PyErr_SetString(PyExc_ValueError, - "Could not convert timedelta value to string"); - PyObject_Free(result); - return NULL; - } - - return result; -} diff --git a/pandas/_libs/src/datetime/pd_datetime.c b/pandas/_libs/src/datetime/pd_datetime.c deleted file mode 100644 index addf9c2939133..0000000000000 --- a/pandas/_libs/src/datetime/pd_datetime.c +++ /dev/null @@ -1,305 +0,0 @@ -/* - -Copyright (c) 2016, PyData Development Team -All rights reserved. - -Distributed under the terms of the BSD Simplified License. - -The full license is in the LICENSE file, distributed with this software. - -Copyright (c) 2005-2011, NumPy Developers -All rights reserved. - -This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt - -*/ - -#define _PANDAS_DATETIME_IMPL - -#define PY_SSIZE_T_CLEAN -#include - -#include "datetime.h" -/* Need to import_array for np_datetime.c (for NumPy 1.x support only) */ -#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY -#include "numpy/ndarrayobject.h" -#include "pandas/datetime/pd_datetime.h" -#include "pandas/portable.h" - -static void pandas_datetime_destructor(PyObject *op) { - void *ptr = PyCapsule_GetPointer(op, PandasDateTime_CAPSULE_NAME); - PyMem_Free(ptr); -} - -/* - * - * Converts a Python datetime.datetime or datetime.date - * object into a NumPy npy_datetimestruct. Uses tzinfo (if present) - * to convert to UTC time. - * - * The following implementation just asks for attributes, and thus - * supports datetime duck typing. The tzinfo time zone conversion - * requires this style of access as well. - * - * Returns -1 on error, 0 on success, and 1 (with no error set) - * if obj doesn't have the needed date or datetime attributes. - */ -static int convert_pydatetime_to_datetimestruct(PyObject *dtobj, - npy_datetimestruct *out) { - // Assumes that obj is a valid datetime object - PyObject *tmp; - PyObject *obj = (PyObject *)dtobj; - - /* Initialize the output to all zeros */ - memset(out, 0, sizeof(npy_datetimestruct)); - out->month = 1; - out->day = 1; - - tmp = PyObject_GetAttrString(obj, "year"); - if (tmp == NULL) - return -1; - out->year = PyLong_AsLong(tmp); - Py_DECREF(tmp); - - tmp = PyObject_GetAttrString(obj, "month"); - if (tmp == NULL) - return -1; - out->month = PyLong_AsLong(tmp); - Py_DECREF(tmp); - - tmp = PyObject_GetAttrString(obj, "day"); - if (tmp == NULL) - return -1; - out->day = PyLong_AsLong(tmp); - Py_DECREF(tmp); - - // TODO(anyone): If we can get PyDateTime_IMPORT to work, we could use - // PyDateTime_Check here, and less verbose attribute lookups. - - /* Check for time attributes (if not there, return success as a date) */ - if (!PyObject_HasAttrString(obj, "hour") || - !PyObject_HasAttrString(obj, "minute") || - !PyObject_HasAttrString(obj, "second") || - !PyObject_HasAttrString(obj, "microsecond")) { - return 0; - } - - tmp = PyObject_GetAttrString(obj, "hour"); - if (tmp == NULL) - return -1; - out->hour = PyLong_AsLong(tmp); - Py_DECREF(tmp); - - tmp = PyObject_GetAttrString(obj, "minute"); - if (tmp == NULL) - return -1; - out->min = PyLong_AsLong(tmp); - Py_DECREF(tmp); - - tmp = PyObject_GetAttrString(obj, "second"); - if (tmp == NULL) - return -1; - out->sec = PyLong_AsLong(tmp); - Py_DECREF(tmp); - - tmp = PyObject_GetAttrString(obj, "microsecond"); - if (tmp == NULL) - return -1; - out->us = PyLong_AsLong(tmp); - Py_DECREF(tmp); - - if (PyObject_HasAttrString(obj, "tzinfo")) { - PyObject *offset = extract_utc_offset(obj); - /* Apply the time zone offset if datetime obj is tz-aware */ - if (offset != NULL) { - if (offset == Py_None) { - Py_DECREF(offset); - return 0; - } - PyObject *tmp_int; - int seconds_offset, minutes_offset; - /* - * The timedelta should have a function "total_seconds" - * which contains the value we want. 
- */ - tmp = PyObject_CallMethod(offset, "total_seconds", ""); - Py_DECREF(offset); - if (tmp == NULL) { - return -1; - } - tmp_int = PyNumber_Long(tmp); - if (tmp_int == NULL) { - Py_DECREF(tmp); - return -1; - } - seconds_offset = PyLong_AsLong(tmp_int); - if (seconds_offset == -1 && PyErr_Occurred()) { - Py_DECREF(tmp_int); - Py_DECREF(tmp); - return -1; - } - Py_DECREF(tmp_int); - Py_DECREF(tmp); - - /* Convert to a minutes offset and apply it */ - minutes_offset = seconds_offset / 60; - - add_minutes_to_datetimestruct(out, -minutes_offset); - } - } - - return 0; -} - -// Converts a Python object representing a Date / Datetime to ISO format -// up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z -// while base="ns" yields "2020-01-01T00:00:00.000000000Z" -// len is mutated to save the length of the returned string -static char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, - size_t *len) { - npy_datetimestruct dts; - int ret; - - ret = convert_pydatetime_to_datetimestruct(obj, &dts); - if (ret != 0) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, - "Could not convert PyDateTime to numpy datetime"); - } - return NULL; - } - - *len = (size_t)get_datetime_iso_8601_strlen(0, base); - char *result = PyObject_Malloc(*len); - // Check to see if PyDateTime has a timezone. - // Don't convert to UTC if it doesn't. - int is_tz_aware = 0; - if (PyObject_HasAttrString(obj, "tzinfo")) { - PyObject *offset = extract_utc_offset(obj); - if (offset == NULL) { - PyObject_Free(result); - return NULL; - } - is_tz_aware = offset != Py_None; - Py_DECREF(offset); - } - ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base); - - if (ret != 0) { - PyErr_SetString(PyExc_ValueError, - "Could not convert datetime value to string"); - PyObject_Free(result); - return NULL; - } - - // Note that get_datetime_iso_8601_strlen just gives a generic size - // for ISO string conversion, not the actual size used - *len = strlen(result); - return result; -} - -// Convert a Python Date/Datetime to Unix epoch with resolution base -static npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) { - npy_datetimestruct dts; - int ret; - - ret = convert_pydatetime_to_datetimestruct(dt, &dts); - if (ret != 0) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, - "Could not convert PyDateTime to numpy datetime"); - - return -1; - } - } - - int64_t npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts); - if (scaleNanosecToUnit(&npy_dt, base) == -1) { - PyErr_Format(PyExc_ValueError, - "Call to scaleNanosecToUnit with value %" NPY_DATETIME_FMT - " and base %d failed", - npy_dt, base); - - return -1; - } - return npy_dt; -} - -/* Initializes and exposes a customer datetime C-API from the pandas library - * by creating a PyCapsule that stores function pointers, which can be accessed - * later by other C code or Cython code that imports the capsule. 
- */ -static int pandas_datetime_exec(PyObject *Py_UNUSED(module)) { - PyDateTime_IMPORT; - PandasDateTime_CAPI *capi = PyMem_Malloc(sizeof(PandasDateTime_CAPI)); - if (capi == NULL) { - PyErr_NoMemory(); - return -1; - } - capi->npy_datetimestruct_to_datetime = npy_datetimestruct_to_datetime; - capi->scaleNanosecToUnit = scaleNanosecToUnit; - capi->int64ToIso = int64ToIso; - capi->PyDateTimeToIso = PyDateTimeToIso; - capi->PyDateTimeToEpoch = PyDateTimeToEpoch; - capi->int64ToIsoDuration = int64ToIsoDuration; - capi->pandas_datetime_to_datetimestruct = pandas_datetime_to_datetimestruct; - capi->pandas_timedelta_to_timedeltastruct = - pandas_timedelta_to_timedeltastruct; - capi->convert_pydatetime_to_datetimestruct = - convert_pydatetime_to_datetimestruct; - capi->cmp_npy_datetimestruct = cmp_npy_datetimestruct; - capi->get_datetime_metadata_from_dtype = get_datetime_metadata_from_dtype; - capi->parse_iso_8601_datetime = parse_iso_8601_datetime; - capi->get_datetime_iso_8601_strlen = get_datetime_iso_8601_strlen; - capi->make_iso_8601_datetime = make_iso_8601_datetime; - capi->make_iso_8601_timedelta = make_iso_8601_timedelta; - - PyObject *capsule = PyCapsule_New(capi, PandasDateTime_CAPSULE_NAME, - pandas_datetime_destructor); - if (capsule == NULL) { - PyMem_Free(capi); - return -1; - } - - // Monkeypatch the top level pandas module to have an attribute for the - // C-API. This is required because Python capsules do not support setting - // this attribute on anything but the top level package. Ideally not - // done when cpython gh-6898 gets implemented - PyObject *pandas = PyImport_ImportModule("pandas"); - if (!pandas) { - PyErr_SetString(PyExc_ImportError, - "pd_datetime.c could not import module pandas"); - Py_DECREF(capsule); - return -1; - } - - if (PyModule_AddObject(pandas, "_pandas_datetime_CAPI", capsule) < 0) { - Py_DECREF(capsule); - return -1; - } - - return 0; -} - -static PyModuleDef_Slot pandas_datetime_slots[] = { - {Py_mod_exec, pandas_datetime_exec}, -#if PY_VERSION_HEX >= 0x030D0000 - {Py_mod_gil, Py_MOD_GIL_NOT_USED}, -#endif - {0, NULL}, -}; - -static struct PyModuleDef pandas_datetimemodule = { - PyModuleDef_HEAD_INIT, - .m_name = "pandas._libs.pandas_datetime", - - .m_doc = "Internal module with datetime support for other extensions", - .m_size = 0, - .m_methods = NULL, - .m_slots = pandas_datetime_slots}; - -PyMODINIT_FUNC PyInit_pandas_datetime(void) { - PyDateTime_IMPORT; - import_array(); - return PyModuleDef_Init(&pandas_datetimemodule); -} diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c deleted file mode 100644 index 851901481d222..0000000000000 --- a/pandas/_libs/src/parser/io.c +++ /dev/null @@ -1,99 +0,0 @@ -/* -Copyright (c) 2016, PyData Development Team -All rights reserved. - -Distributed under the terms of the BSD Simplified License. - -The full license is in the LICENSE file, distributed with this software. 
-*/ - -#include "pandas/parser/io.h" - -/* - On-disk FILE, uncompressed -*/ - -void *new_rd_source(PyObject *obj) { - rd_source *rds = (rd_source *)malloc(sizeof(rd_source)); - - if (rds == NULL) { - PyErr_NoMemory(); - return NULL; - } - /* hold on to this object */ - Py_INCREF(obj); - rds->obj = obj; - rds->buffer = NULL; - rds->position = 0; - - return (void *)rds; -} - -/* - - Cleanup callbacks - - */ - -void del_rd_source(void *rds) { - Py_XDECREF(RDS(rds)->obj); - Py_XDECREF(RDS(rds)->buffer); - free(rds); -} - -/* - - IO callbacks - - */ - -char *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, - int *status, const char *encoding_errors) { - rd_source *src = RDS(source); - PyGILState_STATE state = PyGILState_Ensure(); - - /* delete old object */ - Py_XDECREF(src->buffer); - src->buffer = NULL; - PyObject *args = Py_BuildValue("(i)", nbytes); - - PyObject *func = PyObject_GetAttrString(src->obj, "read"); - - /* Note: PyObject_CallObject requires the GIL */ - PyObject *result = PyObject_CallObject(func, args); - Py_XDECREF(args); - Py_XDECREF(func); - - if (result == NULL) { - PyGILState_Release(state); - *bytes_read = 0; - *status = CALLING_READ_FAILED; - return NULL; - } else if (!PyBytes_Check(result)) { - PyObject *tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors); - Py_DECREF(result); - if (tmp == NULL) { - PyGILState_Release(state); - return NULL; - } - result = tmp; - } - - const size_t length = PySequence_Length(result); - - if (length == 0) - *status = REACHED_EOF; - else - *status = 0; - - /* hang on to the Python object */ - src->buffer = result; - char *retval = PyBytes_AsString(result); - - PyGILState_Release(state); - - /* TODO: more error handling */ - *bytes_read = length; - - return retval; -} diff --git a/pandas/_libs/src/parser/pd_parser.c b/pandas/_libs/src/parser/pd_parser.c deleted file mode 100644 index 51cdf071a15cf..0000000000000 --- a/pandas/_libs/src/parser/pd_parser.c +++ /dev/null @@ -1,182 +0,0 @@ -/* - -Copyright (c) 2023, PyData Development Team -All rights reserved. - -Distributed under the terms of the BSD Simplified License. 
- -*/ -#define _PANDAS_PARSER_IMPL - -#include "pandas/parser/pd_parser.h" -#include "pandas/parser/io.h" -#include "pandas/portable.h" - -static int to_double(char *item, double *p_value, char sci, char decimal, - int *maybe_int) { - char *p_end = NULL; - int error = 0; - - /* Switch to precise xstrtod GH 31364 */ - *p_value = - precise_xstrtod(item, &p_end, decimal, sci, '\0', 1, &error, maybe_int); - - return (error == 0) && (!*p_end); -} - -static int floatify(PyObject *str, double *result, int *maybe_int) { - char *data; - PyObject *tmp = NULL; - const char sci = 'E'; - const char dec = '.'; - - if (PyBytes_Check(str)) { - data = PyBytes_AS_STRING(str); - } else if (PyUnicode_Check(str)) { - tmp = PyUnicode_AsUTF8String(str); - if (tmp == NULL) { - return -1; - } - data = PyBytes_AS_STRING(tmp); - } else { - PyErr_SetString(PyExc_TypeError, "Invalid object type"); - return -1; - } - - const int status = to_double(data, result, sci, dec, maybe_int); - - if (!status) { - /* handle inf/-inf infinity/-infinity */ - if (strlen(data) == 3) { - if (0 == strcasecmp(data, "inf")) { - *result = HUGE_VAL; - *maybe_int = 0; - } else { - goto parsingerror; - } - } else if (strlen(data) == 4) { - if (0 == strcasecmp(data, "-inf")) { - *result = -HUGE_VAL; - *maybe_int = 0; - } else if (0 == strcasecmp(data, "+inf")) { - *result = HUGE_VAL; - *maybe_int = 0; - } else { - goto parsingerror; - } - } else if (strlen(data) == 8) { - if (0 == strcasecmp(data, "infinity")) { - *result = HUGE_VAL; - *maybe_int = 0; - } else { - goto parsingerror; - } - } else if (strlen(data) == 9) { - if (0 == strcasecmp(data, "-infinity")) { - *result = -HUGE_VAL; - *maybe_int = 0; - } else if (0 == strcasecmp(data, "+infinity")) { - *result = HUGE_VAL; - *maybe_int = 0; - } else { - goto parsingerror; - } - } else { - goto parsingerror; - } - } - - Py_XDECREF(tmp); - return 0; - -parsingerror: - PyErr_Format(PyExc_ValueError, "Unable to parse string \"%s\"", data); - Py_XDECREF(tmp); - return -1; -} - -static void pandas_parser_destructor(PyObject *op) { - void *ptr = PyCapsule_GetPointer(op, PandasParser_CAPSULE_NAME); - PyMem_Free(ptr); -} - -static int pandas_parser_exec(PyObject *Py_UNUSED(module)) { - PandasParser_CAPI *capi = PyMem_Malloc(sizeof(PandasParser_CAPI)); - if (capi == NULL) { - PyErr_NoMemory(); - return -1; - } - - capi->to_double = to_double; - capi->floatify = floatify; - capi->new_rd_source = new_rd_source; - capi->del_rd_source = del_rd_source; - capi->buffer_rd_bytes = buffer_rd_bytes; - capi->uint_state_init = uint_state_init; - capi->uint64_conflict = uint64_conflict; - capi->coliter_setup = coliter_setup; - capi->parser_new = parser_new; - capi->parser_init = parser_init; - capi->parser_free = parser_free; - capi->parser_del = parser_del; - capi->parser_add_skiprow = parser_add_skiprow; - capi->parser_set_skipfirstnrows = parser_set_skipfirstnrows; - capi->parser_set_default_options = parser_set_default_options; - capi->parser_consume_rows = parser_consume_rows; - capi->parser_trim_buffers = parser_trim_buffers; - capi->tokenize_all_rows = tokenize_all_rows; - capi->tokenize_nrows = tokenize_nrows; - capi->str_to_int64 = str_to_int64; - capi->str_to_uint64 = str_to_uint64; - capi->xstrtod = xstrtod; - capi->precise_xstrtod = precise_xstrtod; - capi->round_trip = round_trip; - capi->to_boolean = to_boolean; - - PyObject *capsule = - PyCapsule_New(capi, PandasParser_CAPSULE_NAME, pandas_parser_destructor); - if (capsule == NULL) { - PyMem_Free(capi); - return -1; - } - - // Monkeypatch the top 
level pandas module to have an attribute for the - // C-API. This is required because Python capsules do not support setting - // this attribute on anything but the top level package. Ideally not - // done when cpython gh-6898 gets implemented - PyObject *pandas = PyImport_ImportModule("pandas"); - if (!pandas) { - PyErr_SetString(PyExc_ImportError, - "pd_parser.c could not import module pandas"); - Py_DECREF(capsule); - return -1; - } - - if (PyModule_AddObject(pandas, "_pandas_parser_CAPI", capsule) < 0) { - Py_DECREF(capsule); - return -1; - } - - return 0; -} - -static PyModuleDef_Slot pandas_parser_slots[] = { - {Py_mod_exec, pandas_parser_exec}, -#if PY_VERSION_HEX >= 0x030D0000 - {Py_mod_gil, Py_MOD_GIL_NOT_USED}, -#endif - {0, NULL}, -}; - -static struct PyModuleDef pandas_parsermodule = { - PyModuleDef_HEAD_INIT, - .m_name = "pandas._libs.pandas_parser", - - .m_doc = "Internal module with parser support for other extensions", - .m_size = 0, - .m_methods = NULL, - .m_slots = pandas_parser_slots}; - -PyMODINIT_FUNC PyInit_pandas_parser(void) { - return PyModuleDef_Init(&pandas_parsermodule); -} diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c deleted file mode 100644 index 61e96fc835e4d..0000000000000 --- a/pandas/_libs/src/parser/tokenizer.c +++ /dev/null @@ -1,2034 +0,0 @@ -/* - -Copyright (c) 2012, Lambda Foundry, Inc., except where noted - -Incorporates components of WarrenWeckesser/textreader, licensed under 3-clause -BSD - -See LICENSE for the license - -*/ - -/* - -Low-level ascii-file processing for pandas. Combines some elements from -Python's built-in csv module and Warren Weckesser's textreader project on -GitHub. See Python Software Foundation License and BSD licenses for these. - -*/ -#include "pandas/parser/tokenizer.h" -#include "pandas/portable.h" - -#include -#include -#include -#include - -#include "pandas/portable.h" -#include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64 - -void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, - int64_t start) { - // column i, starting at 0 - self->words = parser->words; - self->col = i; - self->line_start = parser->line_start + start; -} - -static void free_if_not_null(void **ptr) { - TRACE(("free_if_not_null %p\n", *ptr)) - if (*ptr != NULL) { - free(*ptr); - *ptr = NULL; - } -} - -/* - - Parser / tokenizer - -*/ - -static void *grow_buffer(void *buffer, uint64_t length, uint64_t *capacity, - int64_t space, int64_t elsize, int *error) { - uint64_t cap = *capacity; - void *newbuffer = buffer; - - // Can we fit potentially nbytes tokens (+ null terminators) in the stream? - while ((length + space >= cap) && (newbuffer != NULL)) { - cap = cap ? 
cap << 1 : 2; - buffer = newbuffer; - newbuffer = realloc(newbuffer, elsize * cap); - } - - if (newbuffer == NULL) { - // realloc failed so don't change *capacity, set *error to errno - // and return the last good realloc'd buffer so it can be freed - *error = errno; - newbuffer = buffer; - } else { - // realloc worked, update *capacity and set *error to 0 - // sigh, multiple return values - *capacity = cap; - *error = 0; - } - return newbuffer; -} - -void parser_set_default_options(parser_t *self) { - self->decimal = '.'; - self->sci = 'E'; - - // For tokenization - self->state = START_RECORD; - - self->delimiter = ','; // XXX - self->delim_whitespace = 0; - - self->doublequote = 0; - self->quotechar = '"'; - self->escapechar = 0; - - self->lineterminator = '\0'; /* NUL->standard logic */ - - self->skipinitialspace = 0; - self->quoting = QUOTE_MINIMAL; - self->allow_embedded_newline = 1; - - self->expected_fields = -1; - self->on_bad_lines = ERROR; - - self->commentchar = '#'; - self->thousands = '\0'; - - self->skipset = NULL; - self->skipfunc = NULL; - self->skip_first_N_rows = -1; - self->skip_footer = 0; -} - -parser_t *parser_new(void) { return (parser_t *)calloc(1, sizeof(parser_t)); } - -static void parser_clear_data_buffers(parser_t *self) { - free_if_not_null((void *)&self->stream); - free_if_not_null((void *)&self->words); - free_if_not_null((void *)&self->word_starts); - free_if_not_null((void *)&self->line_start); - free_if_not_null((void *)&self->line_fields); -} - -static void parser_cleanup(parser_t *self) { - // XXX where to put this - free_if_not_null((void *)&self->error_msg); - free_if_not_null((void *)&self->warn_msg); - - if (self->skipset != NULL) { - kh_destroy_int64((kh_int64_t *)self->skipset); - self->skipset = NULL; - } - - parser_clear_data_buffers(self); - if (self->cb_cleanup != NULL) { - self->cb_cleanup(self->source); - self->cb_cleanup = NULL; - } -} - -int parser_init(parser_t *self) { - /* - Initialize data buffers - */ - - self->stream = NULL; - self->words = NULL; - self->word_starts = NULL; - self->line_start = NULL; - self->line_fields = NULL; - self->error_msg = NULL; - self->warn_msg = NULL; - - // token stream - self->stream = malloc(STREAM_INIT_SIZE); - if (self->stream == NULL) { - parser_cleanup(self); - return PARSER_OUT_OF_MEMORY; - } - self->stream_cap = STREAM_INIT_SIZE; - self->stream_len = 0; - - // word pointers and metadata - _Static_assert(STREAM_INIT_SIZE / 10 > 0, - "STREAM_INIT_SIZE must be defined and >= 10"); - const int64_t sz = STREAM_INIT_SIZE / 10; - self->words = malloc(sz * sizeof(char *)); - self->word_starts = malloc(sz * sizeof(int64_t)); - self->max_words_cap = sz; - self->words_cap = sz; - self->words_len = 0; - - // line pointers and metadata - self->line_start = malloc(sz * sizeof(int64_t)); - - self->line_fields = malloc(sz * sizeof(int64_t)); - - self->lines_cap = sz; - self->lines = 0; - self->file_lines = 0; - - if (self->stream == NULL || self->words == NULL || - self->word_starts == NULL || self->line_start == NULL || - self->line_fields == NULL) { - parser_cleanup(self); - - return PARSER_OUT_OF_MEMORY; - } - - /* amount of bytes buffered */ - self->datalen = 0; - self->datapos = 0; - - self->line_start[0] = 0; - self->line_fields[0] = 0; - - self->pword_start = self->stream; - self->word_start = 0; - - self->state = START_RECORD; - - self->error_msg = NULL; - self->warn_msg = NULL; - - self->commentchar = '\0'; - - return 0; -} - -void parser_free(parser_t *self) { - // opposite of parser_init - 
parser_cleanup(self); -} - -void parser_del(parser_t *self) { free(self); } - -static int make_stream_space(parser_t *self, size_t nbytes) { - // Can we fit potentially nbytes tokens (+ null terminators) in the stream? - - /* - TOKEN STREAM - */ - - int status; - char *orig_ptr = (void *)self->stream; - TRACE(("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", - nbytes)) - self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len, - &self->stream_cap, nbytes * 2, 1, &status); - TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, " - "self->stream_cap=%zu, status=%zu\n", - self->stream, self->stream_len, self->stream_cap, status)) - - if (status != 0) { - return PARSER_OUT_OF_MEMORY; - } - - // realloc sets errno when moving buffer? - if (self->stream != orig_ptr) { - self->pword_start = self->stream + self->word_start; - - for (uint64_t i = 0; i < self->words_len; ++i) { - self->words[i] = self->stream + self->word_starts[i]; - } - } - - /* - WORD VECTORS - */ - - const uint64_t words_cap = self->words_cap; - - /** - * If we are reading in chunks, we need to be aware of the maximum number - * of words we have seen in previous chunks (self->max_words_cap), so - * that way, we can properly allocate when reading subsequent ones. - * - * Otherwise, we risk a buffer overflow if we mistakenly under-allocate - * just because a recent chunk did not have as many words. - */ - const uint64_t length = self->words_len + nbytes < self->max_words_cap - ? self->max_words_cap - nbytes - 1 - : self->words_len; - - self->words = - (char **)grow_buffer((void *)self->words, length, &self->words_cap, - nbytes, sizeof(char *), &status); - TRACE(("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " - "%d)\n", - self->words_len, self->words_cap, nbytes, status)) - if (status != 0) { - return PARSER_OUT_OF_MEMORY; - } - - // realloc took place - if (words_cap != self->words_cap) { - TRACE(("make_stream_space: cap != self->words_cap, nbytes = %d, " - "self->words_cap=%d\n", - nbytes, self->words_cap)) - int64_t *newptr = (int64_t *)realloc(self->word_starts, - sizeof(int64_t) * self->words_cap); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->word_starts = newptr; - } - } - - /* - LINE VECTORS - */ - const uint64_t lines_cap = self->lines_cap; - self->line_start = (int64_t *)grow_buffer((void *)self->line_start, - self->lines + 1, &self->lines_cap, - nbytes, sizeof(int64_t), &status); - TRACE( - ("make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", - self->lines + 1, self->lines_cap, nbytes, status)) - if (status != 0) { - return PARSER_OUT_OF_MEMORY; - } - - // realloc took place - if (lines_cap != self->lines_cap) { - TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) - int64_t *newptr = (int64_t *)realloc(self->line_fields, - sizeof(int64_t) * self->lines_cap); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->line_fields = newptr; - } - } - - return 0; -} - -static int push_char(parser_t *self, char c) { - TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", - self->stream_len + 1, c, self->stream_cap)) - if (self->stream_len >= self->stream_cap) { - TRACE(("push_char: ERROR!!! 
self->stream_len(%d) >= " - "self->stream_cap(%d)\n", - self->stream_len, self->stream_cap)) - const size_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, - "Buffer overflow caught - possible malformed input file.\n"); - return PARSER_OUT_OF_MEMORY; - } - self->stream[self->stream_len++] = c; - return 0; -} - -static inline int end_field(parser_t *self) { - // XXX cruft - if (self->words_len >= self->words_cap) { - TRACE(("end_field: ERROR!!! self->words_len(%zu) >= " - "self->words_cap(%zu)\n", - self->words_len, self->words_cap)) - const size_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, - "Buffer overflow caught - possible malformed input file.\n"); - return PARSER_OUT_OF_MEMORY; - } - - // null terminate token - push_char(self, '\0'); - - // set pointer and metadata - self->words[self->words_len] = self->pword_start; - - TRACE(("end_field: Char diff: %d\n", self->pword_start - self->words[0])); - - TRACE(("end_field: Saw word %s at: %d. Total: %d\n", self->pword_start, - self->word_start, self->words_len + 1)) - - self->word_starts[self->words_len] = self->word_start; - self->words_len++; - - // increment line field count - self->line_fields[self->lines]++; - - // New field begin in stream - self->pword_start = self->stream + self->stream_len; - self->word_start = self->stream_len; - - return 0; -} - -static void append_warning(parser_t *self, const char *msg) { - const int64_t length = strlen(msg); - - if (self->warn_msg == NULL) { - self->warn_msg = malloc(length + 1); - snprintf(self->warn_msg, length + 1, "%s", msg); - } else { - const int64_t ex_length = strlen(self->warn_msg); - char *newptr = (char *)realloc(self->warn_msg, ex_length + length + 1); - if (newptr != NULL) { - self->warn_msg = newptr; - snprintf(self->warn_msg + ex_length, length + 1, "%s", msg); - } - } -} - -static int end_line(parser_t *self) { - int64_t ex_fields = self->expected_fields; - int64_t fields = self->line_fields[self->lines]; - - TRACE(("end_line: Line end, nfields: %d\n", fields)); - - TRACE(("end_line: lines: %d\n", self->lines)); - if (self->lines > 0) { - if (self->expected_fields >= 0) { - ex_fields = self->expected_fields; - } else { - ex_fields = self->line_fields[self->lines - 1]; - } - } - TRACE(("end_line: ex_fields: %d\n", ex_fields)); - - if (self->state == START_FIELD_IN_SKIP_LINE || - self->state == IN_FIELD_IN_SKIP_LINE || - self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || - self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE) { - TRACE(("end_line: Skipping row %d\n", self->file_lines)); - // increment file line count - self->file_lines++; - - // skip the tokens from this bad line - self->line_start[self->lines] += fields; - - // reset field count - self->line_fields[self->lines] = 0; - return 0; - } - - if (!(self->lines <= self->header_end + 1) && (fields > ex_fields) && - !(self->usecols)) { - // increment file line count - self->file_lines++; - - // skip the tokens from this bad line - self->line_start[self->lines] += fields; - - // reset field count - self->line_fields[self->lines] = 0; - - // file_lines is now the actual file line number (starting at 1) - if (self->on_bad_lines == ERROR) { - const size_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, - "Expected %" PRId64 " fields in line %" PRIu64 ", saw %" PRId64 - "\n", - ex_fields, self->file_lines, fields); - - TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); - - return -1; - } else 
{ - // simply skip bad lines - if (self->on_bad_lines == WARN) { - // pass up error message - const size_t bufsize = 100; - char *msg = (char *)malloc(bufsize); - snprintf(msg, bufsize, - "Skipping line %" PRIu64 ": expected %" PRId64 - " fields, saw %" PRId64 "\n", - self->file_lines, ex_fields, fields); - append_warning(self, msg); - free(msg); - } - } - } else { - // missing trailing delimiters - if ((self->lines >= self->header_end + 1) && fields < ex_fields) { - // might overrun the buffer when closing fields - if (make_stream_space(self, ex_fields - fields) < 0) { - const size_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, "out of memory"); - return -1; - } - - while (fields < ex_fields) { - end_field(self); - fields++; - } - } - - // increment both line counts - self->file_lines++; - self->lines++; - - // good line, set new start point - if (self->lines >= self->lines_cap) { - TRACE(("end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", - self->lines, self->lines_cap)) - const size_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, - "Buffer overflow caught - " - "possible malformed input file.\n"); - return PARSER_OUT_OF_MEMORY; - } - self->line_start[self->lines] = - (self->line_start[self->lines - 1] + fields); - - TRACE(("end_line: new line start: %d\n", self->line_start[self->lines])); - - // new line start with 0 fields - self->line_fields[self->lines] = 0; - } - - TRACE(("end_line: Finished line, at %d\n", self->lines)); - - return 0; -} - -int parser_add_skiprow(parser_t *self, int64_t row) { - khiter_t k; - kh_int64_t *set; - int ret = 0; - - if (self->skipset == NULL) { - self->skipset = (void *)kh_init_int64(); - } - - set = (kh_int64_t *)self->skipset; - - k = kh_put_int64(set, row, &ret); - set->keys[k] = row; - - return 0; -} - -void parser_set_skipfirstnrows(parser_t *self, int64_t nrows) { - // self->file_lines is zero based so subtract 1 from nrows - if (nrows > 0) { - self->skip_first_N_rows = nrows - 1; - } -} - -static int parser_buffer_bytes(parser_t *self, size_t nbytes, - const char *encoding_errors) { - int status; - size_t bytes_read; - - status = 0; - self->datapos = 0; - self->data = - self->cb_io(self->source, nbytes, &bytes_read, &status, encoding_errors); - TRACE( - ("parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", - nbytes, bytes_read, status)); - self->datalen = bytes_read; - - if (status != REACHED_EOF && self->data == NULL) { - const size_t bufsize = 200; - self->error_msg = malloc(bufsize); - - if (status == CALLING_READ_FAILED) { - snprintf(self->error_msg, bufsize, - "Calling read(nbytes) on source failed. " - "Try engine='python'."); - } else { - snprintf(self->error_msg, bufsize, "Unknown error in IO callback"); - } - return -1; - } - - TRACE(("datalen: %d\n", self->datalen)); - - return status; -} - -/* - - Tokenization macros and state machine code - -*/ - -#define PUSH_CHAR(c) \ - TRACE(("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ - c, slen, self->stream_cap, self->stream_len)) \ - if (slen >= self->stream_cap) { \ - TRACE(("PUSH_CHAR: ERROR!!! 
slen(%d) >= stream_cap(%d)\n", slen, \ - self->stream_cap)) \ - const size_t bufsize = 100; \ - self->error_msg = malloc(bufsize); \ - snprintf(self->error_msg, bufsize, \ - "Buffer overflow caught - possible malformed input file.\n"); \ - return PARSER_OUT_OF_MEMORY; \ - } \ - *stream++ = c; \ - slen++; - -// This is a little bit of a hack but works for now - -#define END_FIELD() \ - self->stream_len = slen; \ - if (end_field(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ - slen = self->stream_len; - -#define END_LINE_STATE(STATE) \ - self->stream_len = slen; \ - if (end_line(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ - slen = self->stream_len; \ - self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + line_limit) { \ - goto linelimit; \ - } - -#define END_LINE_AND_FIELD_STATE(STATE) \ - self->stream_len = slen; \ - if (end_line(self) < 0) { \ - goto parsingerror; \ - } \ - if (end_field(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ - slen = self->stream_len; \ - self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + line_limit) { \ - goto linelimit; \ - } - -#define END_LINE() END_LINE_STATE(START_RECORD) - -#define IS_TERMINATOR(c) (c == lineterminator) - -#define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) - -// don't parse '\r' with a custom line terminator -#define IS_CARRIAGE(c) (c == carriage_symbol) - -#define IS_COMMENT_CHAR(c) (c == comment_symbol) - -#define IS_ESCAPE_CHAR(c) (c == escape_symbol) - -#define IS_SKIPPABLE_SPACE(c) \ - ((!self->delim_whitespace && c == ' ' && self->skipinitialspace)) - -// applied when in a field -#define IS_DELIMITER(c) \ - ((!delim_whitespace && c == delimiter) || (delim_whitespace && isblank(c))) - -#define _TOKEN_CLEANUP() \ - self->stream_len = slen; \ - self->datapos = i; \ - TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, \ - self->datalen)); - -#define CHECK_FOR_BOM() \ - if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \ - buf += 3; \ - self->datapos += 3; \ - } - -static int skip_this_line(parser_t *self, int64_t rownum) { - if (self->skipfunc != NULL) { - PyGILState_STATE state = PyGILState_Ensure(); - PyObject *result = PyObject_CallFunction(self->skipfunc, "i", rownum); - - // Error occurred. It will be processed - // and caught at the Cython level. - const int should_skip = result == NULL ? -1 : PyObject_IsTrue(result); - - Py_XDECREF(result); - PyGILState_Release(state); - - return should_skip; - } else if (self->skipset != NULL) { - return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) != - ((kh_int64_t *)self->skipset)->n_buckets); - } else { - return (rownum <= self->skip_first_N_rows); - } -} - -static int tokenize_bytes(parser_t *self, size_t line_limit, - uint64_t start_lines) { - char *buf = self->data + self->datapos; - - const char lineterminator = - (self->lineterminator == '\0') ? '\n' : self->lineterminator; - - const int delim_whitespace = self->delim_whitespace; - const char delimiter = self->delimiter; - - // 1000 is something that couldn't fit in "char" - // thus comparing a char to it would always be "false" - const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000; - const int comment_symbol = - (self->commentchar != '\0') ? self->commentchar : 1000; - const int escape_symbol = - (self->escapechar != '\0') ? 
self->escapechar : 1000; - - if (make_stream_space(self, self->datalen - self->datapos) < 0) { - const size_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, "out of memory"); - return -1; - } - - char *stream = self->stream + self->stream_len; - uint64_t slen = self->stream_len; - - TRACE(("%s\n", buf)); - - if (self->file_lines == 0) { - CHECK_FOR_BOM(); - } - - char c; - int64_t i; - for (i = self->datapos; i < self->datalen; ++i) { - // next character in file - c = *buf++; - - TRACE(("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, " - "state %d\n", - i, c, self->file_lines + 1, self->line_fields[self->lines], - self->state)); - - switch (self->state) { - case START_FIELD_IN_SKIP_LINE: - if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_QUOTE(c)) { - self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; - } else if (IS_DELIMITER(c)) { - // Do nothing, we're starting a new field again. - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - } - break; - - case IN_FIELD_IN_SKIP_LINE: - if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_DELIMITER(c)) { - self->state = START_FIELD_IN_SKIP_LINE; - } - break; - - case IN_QUOTED_FIELD_IN_SKIP_LINE: - if (IS_QUOTE(c)) { - if (self->doublequote) { - self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE; - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - } - } - break; - - case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE: - if (IS_QUOTE(c)) { - self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; - } else if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_DELIMITER(c)) { - self->state = START_FIELD_IN_SKIP_LINE; - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - } - break; - - case WHITESPACE_LINE: - if (IS_TERMINATOR(c)) { - self->file_lines++; - self->state = START_RECORD; - break; - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - break; - } else if (!self->delim_whitespace) { - if (isblank(c) && c != self->delimiter) { - } else { // backtrack - // use i + 1 because buf has been incremented but not i - do { - --buf; - --i; - } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf)); - - // reached a newline rather than the beginning - if (IS_TERMINATOR(*buf)) { - ++buf; // move pointer to first char after newline - ++i; - } - self->state = START_FIELD; - } - break; - } - // fall through - - case EAT_WHITESPACE: - if (IS_TERMINATOR(c)) { - END_LINE(); - self->state = START_RECORD; - break; - } else if (IS_CARRIAGE(c)) { - self->state = EAT_CRNL; - break; - } else if (IS_COMMENT_CHAR(c)) { - self->state = EAT_COMMENT; - break; - } else if (!isblank(c)) { - self->state = START_FIELD; - PD_FALLTHROUGH; // fall through to subsequent state - } else { - // if whitespace char, keep slurping - break; - } - - case START_RECORD: { - // start of record - const int should_skip = skip_this_line(self, self->file_lines); - - if (should_skip == -1) { - goto parsingerror; - } else if (should_skip) { - if (IS_QUOTE(c)) { - self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - - if (IS_TERMINATOR(c)) { - END_LINE(); - } - } - break; - } else if (IS_TERMINATOR(c)) { - // \n\r possible? 
- if (self->skip_empty_lines) { - self->file_lines++; - } else { - END_LINE(); - } - break; - } else if (IS_CARRIAGE(c)) { - if (self->skip_empty_lines) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else { - self->state = EAT_CRNL; - } - break; - } else if (IS_COMMENT_CHAR(c)) { - self->state = EAT_LINE_COMMENT; - break; - } else if (isblank(c)) { - if (self->delim_whitespace) { - if (self->skip_empty_lines) { - self->state = WHITESPACE_LINE; - } else { - self->state = EAT_WHITESPACE; - } - break; - } else if (c != self->delimiter && self->skip_empty_lines) { - self->state = WHITESPACE_LINE; - break; - } - } - - // normal character - fall through - // to handle as START_FIELD - self->state = START_FIELD; - PD_FALLTHROUGH; - } - case START_FIELD: - // expecting field - if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else if (IS_QUOTE(c)) { - // start quoted field - self->state = IN_QUOTED_FIELD; - } else if (IS_ESCAPE_CHAR(c)) { - // possible escaped character - self->state = ESCAPED_CHAR; - } else if (IS_SKIPPABLE_SPACE(c)) { - // ignore space at start of field - } else if (IS_DELIMITER(c)) { - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; - } else { - // save empty field - END_FIELD(); - } - } else if (IS_COMMENT_CHAR(c)) { - END_FIELD(); - self->state = EAT_COMMENT; - } else { - // begin new unquoted field - PUSH_CHAR(c); - self->state = IN_FIELD; - } - break; - - case ESCAPED_CHAR: - PUSH_CHAR(c); - self->state = IN_FIELD; - break; - - case EAT_LINE_COMMENT: - if (IS_TERMINATOR(c)) { - self->file_lines++; - self->state = START_RECORD; - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - break; - - case IN_FIELD: - // in unquoted field - if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else if (IS_ESCAPE_CHAR(c)) { - // possible escaped character - self->state = ESCAPED_CHAR; - } else if (IS_DELIMITER(c)) { - // end of field - end of line not reached yet - END_FIELD(); - - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; - } else { - self->state = START_FIELD; - } - } else if (IS_COMMENT_CHAR(c)) { - END_FIELD(); - self->state = EAT_COMMENT; - } else { - // normal character - save in field - PUSH_CHAR(c); - } - break; - - case IN_QUOTED_FIELD: - // in quoted field - if (IS_ESCAPE_CHAR(c)) { - // possible escape character - self->state = ESCAPE_IN_QUOTED_FIELD; - } else if (IS_QUOTE(c)) { - if (self->doublequote) { - // double quote - " represented by "" - self->state = QUOTE_IN_QUOTED_FIELD; - } else { - // end of quote part of field - self->state = IN_FIELD; - } - } else { - // normal character - save in field - PUSH_CHAR(c); - } - break; - - case ESCAPE_IN_QUOTED_FIELD: - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - break; - - case QUOTE_IN_QUOTED_FIELD: - // double quote - seen a quote in an quoted field - if (IS_QUOTE(c)) { - // save "" as " - - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - } else if (IS_DELIMITER(c)) { - // end of field - end of line not reached yet - END_FIELD(); - - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; - } else { - self->state = START_FIELD; - } - } else if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else { - PUSH_CHAR(c); - self->state = IN_FIELD; - } - break; - - case EAT_COMMENT: - if (IS_TERMINATOR(c)) { - 
END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->state = EAT_CRNL; - } - break; - - // only occurs with non-custom line terminator, - // which is why we directly check for '\n' - case EAT_CRNL: - if (c == '\n') { - END_LINE(); - } else if (IS_DELIMITER(c)) { - if (self->delim_whitespace) { - END_LINE_STATE(EAT_WHITESPACE); - } else { - // Handle \r-delimited files - END_LINE_AND_FIELD_STATE(START_FIELD); - } - } else { - if (self->delim_whitespace) { - /* XXX - * first character of a new record--need to back up and - * reread - * to handle properly... - */ - i--; - buf--; // back up one character (HACK!) - END_LINE_STATE(START_RECORD); - } else { - // \r line terminator - // UGH. we don't actually want - // to consume the token. fix this later - self->stream_len = slen; - if (end_line(self) < 0) { - goto parsingerror; - } - - stream = self->stream + self->stream_len; - slen = self->stream_len; - self->state = START_RECORD; - - --i; - buf--; // let's try this character again (HACK!) - if (line_limit > 0 && self->lines == start_lines + line_limit) { - goto linelimit; - } - } - } - break; - - // only occurs with non-custom line terminator, - // which is why we directly check for '\n' - case EAT_CRNL_NOP: // inside an ignored comment line - self->state = START_RECORD; - // \r line terminator -- parse this character again - if (c != '\n' && !IS_DELIMITER(c)) { - --i; - --buf; - } - break; - default: - break; - } - } - - _TOKEN_CLEANUP(); - - TRACE(("Finished tokenizing input\n")) - - return 0; - -parsingerror: - i++; - _TOKEN_CLEANUP(); - - return -1; - -linelimit: - i++; - _TOKEN_CLEANUP(); - - return 0; -} - -static int parser_handle_eof(parser_t *self) { - const size_t bufsize = 100; - - TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) - - if (self->datalen != 0) - return -1; - - switch (self->state) { - case START_RECORD: - case WHITESPACE_LINE: - case EAT_CRNL_NOP: - case EAT_LINE_COMMENT: - return 0; - - case ESCAPE_IN_QUOTED_FIELD: - case IN_QUOTED_FIELD: - self->error_msg = (char *)malloc(bufsize); - snprintf(self->error_msg, bufsize, - "EOF inside string starting at row %" PRIu64, self->file_lines); - return -1; - - case ESCAPED_CHAR: - self->error_msg = (char *)malloc(bufsize); - snprintf(self->error_msg, bufsize, "EOF following escape character"); - return -1; - - case IN_FIELD: - case START_FIELD: - case QUOTE_IN_QUOTED_FIELD: - if (end_field(self) < 0) - return -1; - break; - - default: - break; - } - - if (end_line(self) < 0) - return -1; - else - return 0; -} - -int parser_consume_rows(parser_t *self, size_t nrows) { - if (nrows > self->lines) { - nrows = self->lines; - } - - /* do nothing */ - if (nrows == 0) - return 0; - - /* cannot guarantee that nrows + 1 has been observed */ - const int64_t word_deletions = - self->line_start[nrows - 1] + self->line_fields[nrows - 1]; - - /* if word_deletions == 0 (i.e. this case) then char_count must - * be 0 too, as no data needs to be skipped */ - const uint64_t char_count = - word_deletions >= 1 ? 
(self->word_starts[word_deletions - 1] + - strlen(self->words[word_deletions - 1]) + 1) - : 0; - - TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, - char_count)); - - /* move stream, only if something to move */ - if (char_count < self->stream_len) { - memmove(self->stream, (self->stream + char_count), - self->stream_len - char_count); - } - /* buffer counts */ - self->stream_len -= char_count; - - /* move token metadata */ - // Note: We should always have words_len < word_deletions, so this - // subtraction will remain appropriately-typed. - int64_t offset; - for (uint64_t i = 0; i < self->words_len - word_deletions; ++i) { - offset = i + word_deletions; - - self->words[i] = self->words[offset] - char_count; - self->word_starts[i] = self->word_starts[offset] - char_count; - } - self->words_len -= word_deletions; - - /* move current word pointer to stream */ - self->pword_start -= char_count; - self->word_start -= char_count; - - /* move line metadata */ - // Note: We should always have self->lines - nrows + 1 >= 0, so this - // subtraction will remain appropriately-typed. - for (uint64_t i = 0; i < self->lines - nrows + 1; ++i) { - offset = i + nrows; - self->line_start[i] = self->line_start[offset] - word_deletions; - self->line_fields[i] = self->line_fields[offset]; - } - self->lines -= nrows; - - return 0; -} - -static size_t _next_pow2(size_t sz) { - size_t result = 1; - while (result < sz) - result *= 2; - return result; -} - -int parser_trim_buffers(parser_t *self) { - /* - Free memory - */ - - /** - * Before we free up space and trim, we should - * save how many words we saw when parsing, if - * it exceeds the maximum number we saw before. - * - * This is important for when we read in chunks, - * so that we can inform subsequent chunk parsing - * as to how many words we could possibly see. - */ - if (self->words_cap > self->max_words_cap) { - self->max_words_cap = self->words_cap; - } - - /* trim words, word_starts */ - size_t new_cap = _next_pow2(self->words_len) + 1; - if (new_cap < self->words_cap) { - TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); - self->words = realloc(self->words, new_cap * sizeof(char *)); - if (self->words == NULL) { - return PARSER_OUT_OF_MEMORY; - } - self->word_starts = realloc(self->word_starts, new_cap * sizeof(int64_t)); - if (self->word_starts == NULL) { - return PARSER_OUT_OF_MEMORY; - } - self->words_cap = new_cap; - } - - /* trim stream */ - new_cap = _next_pow2(self->stream_len) + 1; - TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = " - "%zu\n", - new_cap, self->stream_cap, self->lines_cap)); - if (new_cap < self->stream_cap) { - TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling " - "realloc\n")); - void *newptr = realloc(self->stream, new_cap); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - // Update the pointers in the self->words array (char **) if - // `realloc` - // moved the `self->stream` buffer. This block mirrors a similar - // block in - // `make_stream_space`. 
- if (self->stream != newptr) { - self->pword_start = (char *)newptr + self->word_start; - - for (uint64_t i = 0; i < self->words_len; ++i) { - self->words[i] = (char *)newptr + self->word_starts[i]; - } - } - - self->stream = newptr; - self->stream_cap = new_cap; - } - } - - /* trim line_start, line_fields */ - new_cap = _next_pow2(self->lines) + 1; - if (new_cap < self->lines_cap) { - TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - void *newptr = realloc(self->line_start, new_cap * sizeof(int64_t)); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->line_start = newptr; - } - newptr = realloc(self->line_fields, new_cap * sizeof(int64_t)); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->line_fields = newptr; - self->lines_cap = new_cap; - } - } - - return 0; -} - -/* - nrows : number of rows to tokenize (or until reach EOF) - all : tokenize all the data vs. certain number of rows - */ - -static int _tokenize_helper(parser_t *self, size_t nrows, int all, - const char *encoding_errors) { - int status = 0; - const uint64_t start_lines = self->lines; - - if (self->state == FINISHED) { - return 0; - } - - TRACE( - ("_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n", - nrows, self->datapos, self->datalen)); - - while (1) { - if (!all && self->lines - start_lines >= nrows) - break; - - if (self->datapos == self->datalen) { - status = parser_buffer_bytes(self, self->chunksize, encoding_errors); - - if (status == REACHED_EOF) { - // close out last line - status = parser_handle_eof(self); - self->state = FINISHED; - break; - } else if (status != 0) { - return status; - } - } - - TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, " - "datapos= %d\n", - self->datalen - self->datapos, self->datalen, self->datapos)); - - status = tokenize_bytes(self, nrows, start_lines); - - if (status < 0) { - // XXX - TRACE(("_tokenize_helper: Status %d returned from tokenize_bytes, " - "breaking\n", - status)); - status = -1; - break; - } - } - TRACE(("leaving tokenize_helper\n")); - return status; -} - -int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) { - return _tokenize_helper(self, nrows, 0, encoding_errors); -} - -int tokenize_all_rows(parser_t *self, const char *encoding_errors) { - return _tokenize_helper(self, -1, 1, encoding_errors); -} - -/* - * Function: to_boolean - * -------------------- - * - * Validate if item should be recognized as a boolean field. - * - * item: const char* representing parsed text - * val : pointer to a uint8_t of boolean representation - * - * If item is determined to be boolean, this method will set - * the appropriate value of val and return 0. A non-zero exit - * status means that item was not inferred to be boolean, and - * leaves the value of *val unmodified. - */ -int to_boolean(const char *item, uint8_t *val) { - if (strcasecmp(item, "TRUE") == 0) { - *val = 1; - return 0; - } else if (strcasecmp(item, "FALSE") == 0) { - *val = 0; - return 0; - } - - return -1; -} - -// --------------------------------------------------------------------------- -// Implementation of xstrtod - -// -// strtod.c -// -// Convert string to double -// -// Copyright (C) 2002 Michael Ringgaard. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// 1. 
Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// 3. Neither the name of the project nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -// LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -// SUCH DAMAGE. -// -// ----------------------------------------------------------------------- -// Modifications by Warren Weckesser, March 2011: -// * Rename strtod() to xstrtod(). -// * Added decimal and sci arguments. -// * Skip trailing spaces. -// * Commented out the other functions. -// Modifications by Richard T Guy, August 2013: -// * Add tsep argument for thousands separator -// - -double xstrtod(const char *str, char **endptr, char decimal, char sci, - char tsep, int skip_trailing, int *error, int *maybe_int) { - const char *p = str; - if (maybe_int != NULL) - *maybe_int = 1; - // Skip leading whitespace. - while (isspace_ascii(*p)) - p++; - - // Handle optional sign. - int negative = 0; - switch (*p) { - case '-': - negative = 1; - PD_FALLTHROUGH; // Fall through to increment position. - case '+': - p++; - break; - } - - int exponent = 0; - int num_digits = 0; - int num_decimals = 0; - - // pessimistic but quick assessment, - // assuming that each decimal digit requires 4 bits to store - // TODO: C23 has UINT64_WIDTH macro that can be used at compile time - const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4; - - // Process string of digits. - unsigned int i_number = 0; - while (isdigit_ascii(*p) && num_digits <= max_int_decimal_digits) { - i_number = i_number * 10 + (*p - '0'); - p++; - num_digits++; - - p += (tsep != '\0' && *p == tsep); - } - double number = i_number; - - if (num_digits > max_int_decimal_digits) { - // process what's left as double - while (isdigit_ascii(*p)) { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - - p += (tsep != '\0' && *p == tsep); - } - } - - // Process decimal part. - if (*p == decimal) { - if (maybe_int != NULL) - *maybe_int = 0; - p++; - - while (isdigit_ascii(*p)) { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - num_decimals++; - } - - exponent -= num_decimals; - } - - if (num_digits == 0) { - *error = ERANGE; - return 0.0; - } - - // Correct for sign. - if (negative) - number = -number; - - // Process an exponent string. - if (toupper_ascii(*p) == toupper_ascii(sci)) { - if (maybe_int != NULL) - *maybe_int = 0; - - // Handle optional sign. 
- negative = 0; - switch (*++p) { - case '-': - negative = 1; - PD_FALLTHROUGH; // Fall through to increment position. - case '+': - p++; - break; - } - - // Process string of digits. - num_digits = 0; - int n = 0; - while (isdigit_ascii(*p)) { - n = n * 10 + (*p - '0'); - num_digits++; - p++; - } - - if (negative) - exponent -= n; - else - exponent += n; - - // If no digits, after the 'e'/'E', un-consume it - if (num_digits == 0) - p--; - } - - if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) { - *error = ERANGE; - return HUGE_VAL; - } - - // Scale the result. - double p10 = 10.; - int n = exponent; - if (n < 0) - n = -n; - while (n) { - if (n & 1) { - if (exponent < 0) - number /= p10; - else - number *= p10; - } - n >>= 1; - p10 *= p10; - } - - if (number == HUGE_VAL) { - *error = ERANGE; - } - - if (skip_trailing) { - // Skip trailing whitespace. - while (isspace_ascii(*p)) - p++; - } - - if (endptr) - *endptr = (char *)p; - return number; -} - -double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, - char tsep, int skip_trailing, int *error, - int *maybe_int) { - const char *p = str; - const int max_digits = 17; - - if (maybe_int != NULL) - *maybe_int = 1; - // Cache powers of 10 in memory. - static double e[] = { - 1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, - 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, - 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, - 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, - 1e40, 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, - 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, - 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, - 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, - 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, - 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, - 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, - 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, - 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, - 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, - 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, - 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, - 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, - 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, - 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, - 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, - 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, - 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, - 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, - 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, - 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, - 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, - 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, - 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, - 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, - 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, - 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; - - // Skip leading whitespace. - while (isspace_ascii(*p)) - p++; - - // Handle optional sign. 
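// Unlike xstrtod above, precise_xstrtod caps the mantissa at max_digits
// (17) significant digits -- enough to round-trip an IEEE-754 double --
// folds any further integer digits into `exponent`, and scales at the end
// through the cached powers of ten in e[].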
- int negative = 0; - switch (*p) { - case '-': - negative = 1; - PD_FALLTHROUGH; // Fall through to increment position. - case '+': - p++; - break; - } - - double number = 0.; - int exponent = 0; - int num_digits = 0; - int num_decimals = 0; - - // Process string of digits. - while (isdigit_ascii(*p)) { - if (num_digits < max_digits) { - number = number * 10. + (*p - '0'); - num_digits++; - } else { - ++exponent; - } - - p++; - p += (tsep != '\0' && *p == tsep); - } - - // Process decimal part - if (*p == decimal) { - if (maybe_int != NULL) - *maybe_int = 0; - p++; - - while (num_digits < max_digits && isdigit_ascii(*p)) { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - num_decimals++; - } - - if (num_digits >= max_digits) // Consume extra decimal digits. - while (isdigit_ascii(*p)) - ++p; - - exponent -= num_decimals; - } - - if (num_digits == 0) { - *error = ERANGE; - return 0.0; - } - - // Correct for sign. - if (negative) - number = -number; - - // Process an exponent string. - if (toupper_ascii(*p) == toupper_ascii(sci)) { - if (maybe_int != NULL) - *maybe_int = 0; - - // Handle optional sign - negative = 0; - switch (*++p) { - case '-': - negative = 1; - PD_FALLTHROUGH; // Fall through to increment position. - case '+': - p++; - break; - } - - // Process string of digits. - num_digits = 0; - int n = 0; - while (num_digits < max_digits && isdigit_ascii(*p)) { - n = n * 10 + (*p - '0'); - num_digits++; - p++; - } - - if (negative) - exponent -= n; - else - exponent += n; - - // If no digits after the 'e'/'E', un-consume it. - if (num_digits == 0) - p--; - } - - if (exponent > 308) { - *error = ERANGE; - return HUGE_VAL; - } else if (exponent > 0) { - number *= e[exponent]; - } else if (exponent < -308) { // Subnormal - if (exponent < -616) { // Prevent invalid array access. - number = 0.; - } else { - number /= e[-308 - exponent]; - number /= e[308]; - } - - } else { - number /= e[-exponent]; - } - - if (number == HUGE_VAL || number == -HUGE_VAL) - *error = ERANGE; - - if (skip_trailing) { - // Skip trailing whitespace. - while (isspace_ascii(*p)) - p++; - } - - if (endptr) - *endptr = (char *)p; - return number; -} - -/* copy a decimal number string with `decimal`, `tsep` as decimal point - and thousands separator to an equivalent c-locale decimal string (striping - `tsep`, replacing `decimal` with '.'). The returned memory should be free-d - with a call to `free`. -*/ - -static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, - char tsep) { - const char *p = s; - const size_t length = strlen(s); - char *s_copy = malloc(length + 1); - char *dst = s_copy; - // Skip leading whitespace. - while (isspace_ascii(*p)) - p++; - // Copy Leading sign - if (*p == '+' || *p == '-') { - *dst++ = *p++; - } - // Copy integer part dropping `tsep` - while (isdigit_ascii(*p)) { - *dst++ = *p++; - p += (tsep != '\0' && *p == tsep); - } - // Replace `decimal` with '.' 
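// e.g. with decimal=',' and tsep='.', the input "1.234,5e6" is copied to
// the C-locale form "1234.5e6" (thousands separators dropped, ',' -> '.').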
- if (*p == decimal) { - *dst++ = '.'; - p++; - } - // Copy fractional part after decimal (if any) - while (isdigit_ascii(*p)) { - *dst++ = *p++; - } - // Copy exponent if any - if (toupper_ascii(*p) == toupper_ascii('E')) { - *dst++ = *p++; - // Copy leading exponent sign (if any) - if (*p == '+' || *p == '-') { - *dst++ = *p++; - } - // Copy exponent digits - while (isdigit_ascii(*p)) { - *dst++ = *p++; - } - } - *dst++ = '\0'; // terminate - if (endpos != NULL) - *endpos = (char *)p; - return s_copy; -} - -double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci), - char tsep, int skip_trailing, int *error, int *maybe_int) { - // 'normalize' representation to C-locale; replace decimal with '.' and - // remove thousands separator. - char *endptr; - char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); - // This is called from a nogil block in parsers.pyx - // so need to explicitly get GIL before Python calls - PyGILState_STATE gstate = PyGILState_Ensure(); - char *endpc; - const double r = PyOS_string_to_double(pc, &endpc, 0); - // PyOS_string_to_double needs to consume the whole string - if (endpc == pc + strlen(pc)) { - if (q != NULL) { - // report endptr from source string (p) - *q = endptr; - } - } else { - *error = -1; - if (q != NULL) { - // p and pc are different len due to tsep removal. Can't report - // how much it has consumed of p. Just rewind to beginning. - *q = (char *)p; // TODO(willayd): this could be undefined behavior - } - } - if (maybe_int != NULL) - *maybe_int = 0; - if (PyErr_Occurred() != NULL) - *error = -1; - else if (r == Py_HUGE_VAL) - *error = (int)Py_HUGE_VAL; - PyErr_Clear(); - - PyGILState_Release(gstate); - free(pc); - if (skip_trailing && q != NULL && *q != p) { - while (isspace_ascii(**q)) { - (*q)++; - } - } - return r; -} - -// End of xstrtod code -// --------------------------------------------------------------------------- - -void uint_state_init(uint_state *self) { - self->seen_sint = 0; - self->seen_uint = 0; - self->seen_null = 0; -} - -int uint64_conflict(uint_state *self) { - return self->seen_uint && (self->seen_sint || self->seen_null); -} - -int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, - int *error, char tsep) { - const char *p = p_item; - // Skip leading spaces. - while (isspace_ascii(*p)) { - ++p; - } - - // Handle sign. - const bool isneg = *p == '-' ? true : false; - // Handle sign. - if (isneg || (*p == '+')) { - p++; - } - - // Check that there is a first digit. - if (!isdigit_ascii(*p)) { - // Error... - *error = ERROR_NO_DIGITS; - return 0; - } - - int64_t number = 0; - if (isneg) { - // If number is greater than pre_min, at least one more digit - // can be processed without overflowing. - int dig_pre_min = -(int_min % 10); - int64_t pre_min = int_min / 10; - - // Process the digits. - char d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number > pre_min) || - ((number == pre_min) && (d - '0' <= dig_pre_min))) { - number = number * 10 - (d - '0'); - d = *++p; - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } else { - while (isdigit_ascii(d)) { - if ((number > pre_min) || - ((number == pre_min) && (d - '0' <= dig_pre_min))) { - number = number * 10 - (d - '0'); - d = *++p; - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } - } else { - // If number is less than pre_max, at least one more digit - // can be processed without overflowing. 
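// For the int64_t limits this means pre_max = INT64_MAX / 10 =
// 922337203685477580 and dig_pre_max = INT64_MAX % 10 = 7, so another
// digit d is accepted exactly when number < pre_max, or number == pre_max
// and d - '0' <= 7.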
- int64_t pre_max = int_max / 10; - int dig_pre_max = int_max % 10; - - // Process the digits. - char d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } else { - while (isdigit_ascii(d)) { - if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } - } - - // Skip trailing spaces. - while (isspace_ascii(*p)) { - ++p; - } - - // Did we use up all the characters? - if (*p) { - *error = ERROR_INVALID_CHARS; - return 0; - } - - *error = 0; - return number; -} - -uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, - uint64_t uint_max, int *error, char tsep) { - const char *p = p_item; - // Skip leading spaces. - while (isspace_ascii(*p)) { - ++p; - } - - // Handle sign. - if (*p == '-') { - state->seen_sint = 1; - *error = 0; - return 0; - } else if (*p == '+') { - p++; - } - - // Check that there is a first digit. - if (!isdigit_ascii(*p)) { - // Error... - *error = ERROR_NO_DIGITS; - return 0; - } - - // If number is less than pre_max, at least one more digit - // can be processed without overflowing. - // - // Process the digits. - uint64_t number = 0; - const uint64_t pre_max = uint_max / 10; - const uint64_t dig_pre_max = uint_max % 10; - char d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number < pre_max) || - ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } else { - while (isdigit_ascii(d)) { - if ((number < pre_max) || - ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } - - // Skip trailing spaces. - while (isspace_ascii(*p)) { - ++p; - } - - // Did we use up all the characters? - if (*p) { - *error = ERROR_INVALID_CHARS; - return 0; - } - - if (number > (uint64_t)int_max) { - state->seen_uint = 1; - } - - *error = 0; - return number; -} diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c deleted file mode 100644 index 9a022095feee9..0000000000000 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ /dev/null @@ -1,855 +0,0 @@ -/* - -Copyright (c) 2016, PyData Development Team -All rights reserved. - -Distributed under the terms of the BSD Simplified License. - -The full license is in the LICENSE file, distributed with this software. - -Copyright (c) 2005-2011, NumPy Developers -All rights reserved. - -This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt - -*/ - -// Licence at LICENSES/NUMPY_LICENSE - -#ifndef NPY_NO_DEPRECATED_API -#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#endif // NPY_NO_DEPRECATED_API - -#include "pandas/vendored/numpy/datetime/np_datetime.h" -#define NO_IMPORT_ARRAY -#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY -#include -#include -#include - -#if defined(_WIN32) -#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS -#define ENABLE_INTSAFE_SIGNED_FUNCTIONS -#endif -#include -#define checked_int64_add(a, b, res) LongLongAdd(a, b, res) -#define checked_int64_sub(a, b, res) LongLongSub(a, b, res) -#define checked_int64_mul(a, b, res) LongLongMult(a, b, res) -#else -#if defined __has_builtin -#if __has_builtin(__builtin_add_overflow) -#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) -#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) -#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) -#else -_Static_assert(0, - "Overflow checking not detected; please try a newer compiler"); -#endif -// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment -// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that -#elif __GNUC__ > 7 -#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) -#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) -#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) -#else -_Static_assert(0, "__has_builtin not detected; please try a newer compiler"); -#endif -#endif - -#define XSTR(a) STR(a) -#define STR(a) #a - -#define PD_CHECK_OVERFLOW(FUNC) \ - do { \ - if ((FUNC) != 0) { \ - PyGILState_STATE gstate = PyGILState_Ensure(); \ - PyErr_SetString(PyExc_OverflowError, \ - "Overflow occurred at " __FILE__ ":" XSTR(__LINE__)); \ - PyGILState_Release(gstate); \ - return -1; \ - } \ - } while (0) - -const int days_per_month_table[2][12] = { - {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, - {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; - -/* - * Returns 1 if the given year is a leap year, 0 otherwise. - */ -int is_leapyear(npy_int64 year) { - return (year & 0x3) == 0 && /* year % 4 == 0 */ - ((year % 100) != 0 || (year % 400) == 0); -} - -/* - * Adjusts a datetimestruct based on a minutes offset. Assumes - * the current values are valid.g - */ -void add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes) { - int isleap; - - /* MINUTES */ - dts->min += minutes; - while (dts->min < 0) { - dts->min += 60; - dts->hour--; - } - while (dts->min >= 60) { - dts->min -= 60; - dts->hour++; - } - - /* HOURS */ - while (dts->hour < 0) { - dts->hour += 24; - dts->day--; - } - while (dts->hour >= 24) { - dts->hour -= 24; - dts->day++; - } - - /* DAYS */ - if (dts->day < 1) { - dts->month--; - if (dts->month < 1) { - dts->year--; - dts->month = 12; - } - isleap = is_leapyear(dts->year); - dts->day += days_per_month_table[isleap][dts->month - 1]; - } else if (dts->day > 28) { - isleap = is_leapyear(dts->year); - if (dts->day > days_per_month_table[isleap][dts->month - 1]) { - dts->day -= days_per_month_table[isleap][dts->month - 1]; - dts->month++; - if (dts->month > 12) { - dts->year++; - dts->month = 1; - } - } - } -} - -/* - * Calculates the days offset from the 1970 epoch. 
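 * As a quick sanity check: 2000-03-01 is 30*365 + 7 leap days + 31 + 29
 * = 11017 days after 1970-01-01.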
- */ -npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) { - int i, month; - npy_int64 year, days = 0; - const int *month_lengths; - - PD_CHECK_OVERFLOW(checked_int64_sub(dts->year, 1970, &year)); - PD_CHECK_OVERFLOW(checked_int64_mul(year, 365, &days)); - - /* Adjust for leap years */ - if (days >= 0) { - /* - * 1968 is the closest leap year before 1970. - * Exclude the current year, so add 1. - */ - PD_CHECK_OVERFLOW(checked_int64_add(year, 1, &year)); - /* Add one day for each 4 years */ - PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days)); - /* 1900 is the closest previous year divisible by 100 */ - PD_CHECK_OVERFLOW(checked_int64_add(year, 68, &year)); - /* Subtract one day for each 100 years */ - PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days)); - /* 1600 is the closest previous year divisible by 400 */ - PD_CHECK_OVERFLOW(checked_int64_add(year, 300, &year)); - /* Add one day for each 400 years */ - PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days)); - } else { - /* - * 1972 is the closest later year after 1970. - * Include the current year, so subtract 2. - */ - PD_CHECK_OVERFLOW(checked_int64_sub(year, 2, &year)); - /* Subtract one day for each 4 years */ - PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days)); - /* 2000 is the closest later year divisible by 100 */ - PD_CHECK_OVERFLOW(checked_int64_sub(year, 28, &year)); - /* Add one day for each 100 years */ - PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days)); - /* 2000 is also the closest later year divisible by 400 */ - /* Subtract one day for each 400 years */ - PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days)); - } - - month_lengths = days_per_month_table[is_leapyear(dts->year)]; - month = dts->month - 1; - - /* Add the months */ - for (i = 0; i < month; ++i) { - PD_CHECK_OVERFLOW(checked_int64_add(days, month_lengths[i], &days)); - } - - /* Add the days */ - PD_CHECK_OVERFLOW(checked_int64_add(days, dts->day - 1, &days)); - - return days; -} - -/* - * Modifies '*days_' to be the day offset within the year, - * and returns the year. - */ -static npy_int64 days_to_yearsdays(npy_int64 *days_) { - const npy_int64 days_per_400years = (400 * 365 + 100 - 4 + 1); - /* Adjust so it's relative to the year 2000 (divisible by 400) */ - npy_int64 days = (*days_) - (365 * 30 + 7); - npy_int64 year; - - /* Break down the 400 year cycle to get the year and day within the year */ - if (days >= 0) { - year = 400 * (days / days_per_400years); - days = days % days_per_400years; - } else { - year = 400 * ((days - (days_per_400years - 1)) / days_per_400years); - days = days % days_per_400years; - if (days < 0) { - days += days_per_400years; - } - } - - /* Work out the year/day within the 400 year cycle */ - if (days >= 366) { - year += 100 * ((days - 1) / (100 * 365 + 25 - 1)); - days = (days - 1) % (100 * 365 + 25 - 1); - if (days >= 365) { - year += 4 * ((days + 1) / (4 * 365 + 1)); - days = (days + 1) % (4 * 365 + 1); - if (days >= 366) { - year += (days - 1) / 365; - days = (days - 1) % 365; - } - } - } - - *days_ = days; - return year + 2000; -} - -/* - * Fills in the year, month, day in 'dts' based on the days - * offset from 1970. 
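 * (days_to_yearsdays() above first re-bases to 2000-01-01, which is
 * 365*30 + 7 = 10957 days after the epoch, because 2000 starts a 400-year
 * Gregorian cycle of 146097 days.)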
- */ -static void set_datetimestruct_days(npy_int64 days, npy_datetimestruct *dts) { - const int *month_lengths; - int i; - - dts->year = days_to_yearsdays(&days); - month_lengths = days_per_month_table[is_leapyear(dts->year)]; - - for (i = 0; i < 12; ++i) { - if (days < month_lengths[i]) { - dts->month = i + 1; - dts->day = (npy_int32)days + 1; - return; - } else { - days -= month_lengths[i]; - } - } -} - -/* - * Compares two npy_datetimestruct objects chronologically - */ -int cmp_npy_datetimestruct(const npy_datetimestruct *a, - const npy_datetimestruct *b) { - if (a->year > b->year) { - return 1; - } else if (a->year < b->year) { - return -1; - } - - if (a->month > b->month) { - return 1; - } else if (a->month < b->month) { - return -1; - } - - if (a->day > b->day) { - return 1; - } else if (a->day < b->day) { - return -1; - } - - if (a->hour > b->hour) { - return 1; - } else if (a->hour < b->hour) { - return -1; - } - - if (a->min > b->min) { - return 1; - } else if (a->min < b->min) { - return -1; - } - - if (a->sec > b->sec) { - return 1; - } else if (a->sec < b->sec) { - return -1; - } - - if (a->us > b->us) { - return 1; - } else if (a->us < b->us) { - return -1; - } - - if (a->ps > b->ps) { - return 1; - } else if (a->ps < b->ps) { - return -1; - } - - if (a->as > b->as) { - return 1; - } else if (a->as < b->as) { - return -1; - } - - return 0; -} -/* - * Returns the offset from utc of the timezone as a timedelta. - * The caller is responsible for ensuring that the tzinfo - * attribute exists on the datetime object. - * - * If the passed object is timezone naive, Py_None is returned. - * If extraction of the offset fails, NULL is returned. - * - * NOTE: This function is not vendored from numpy. - */ -PyObject *extract_utc_offset(PyObject *obj) { - PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo"); - if (tmp == NULL) { - return NULL; - } - if (tmp != Py_None) { - PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj); - if (offset == NULL) { - Py_DECREF(tmp); - return NULL; - } - return offset; - } - return tmp; -} - -static inline int scaleYearToEpoch(int64_t year, int64_t *result) { - return checked_int64_sub(year, 1970, result); -} - -static inline int scaleYearsToMonths(int64_t years, int64_t *result) { - return checked_int64_mul(years, 12, result); -} - -static inline int scaleDaysToWeeks(int64_t days, int64_t *result) { - if (days >= 0) { - *result = days / 7; - return 0; - } else { - int res; - int64_t checked_days; - if ((res = checked_int64_sub(days, 6, &checked_days))) { - return res; - } - - *result = checked_days / 7; - return 0; - } -} - -static inline int scaleDaysToHours(int64_t days, int64_t *result) { - return checked_int64_mul(days, 24, result); -} - -static inline int scaleHoursToMinutes(int64_t hours, int64_t *result) { - return checked_int64_mul(hours, 60, result); -} - -static inline int scaleMinutesToSeconds(int64_t minutes, int64_t *result) { - return checked_int64_mul(minutes, 60, result); -} - -static inline int scaleSecondsToMilliseconds(int64_t seconds, int64_t *result) { - return checked_int64_mul(seconds, 1000, result); -} - -static inline int scaleSecondsToMicroseconds(int64_t seconds, int64_t *result) { - return checked_int64_mul(seconds, 1000000, result); -} - -static inline int scaleMicrosecondsToNanoseconds(int64_t microseconds, - int64_t *result) { - return checked_int64_mul(microseconds, 1000, result); -} - -static inline int scaleMicrosecondsToPicoseconds(int64_t microseconds, - int64_t *result) { - return 
checked_int64_mul(microseconds, 1000000, result); -} - -static inline int64_t scalePicosecondsToFemtoseconds(int64_t picoseconds, - int64_t *result) { - return checked_int64_mul(picoseconds, 1000, result); -} - -static inline int64_t scalePicosecondsToAttoseconds(int64_t picoseconds, - int64_t *result) { - return checked_int64_mul(picoseconds, 1000000, result); -} - -/* - * Converts a datetime from a datetimestruct to a datetime based - * on a metadata unit. Returns -1 on and sets PyErr on error. - */ -npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, - const npy_datetimestruct *dts) { - if ((base == NPY_FR_Y) || (base == NPY_FR_M)) { - int64_t years; - PD_CHECK_OVERFLOW(scaleYearToEpoch(dts->year, &years)); - - if (base == NPY_FR_Y) { - return years; - } - - int64_t months; - PD_CHECK_OVERFLOW(scaleYearsToMonths(years, &months)); - - int64_t months_adder; - PD_CHECK_OVERFLOW(checked_int64_sub(dts->month, 1, &months_adder)); - PD_CHECK_OVERFLOW(checked_int64_add(months, months_adder, &months)); - - if (base == NPY_FR_M) { - return months; - } - } - - const int64_t days = get_datetimestruct_days(dts); - if (days == -1) { - PyGILState_STATE gstate = PyGILState_Ensure(); - bool did_error = PyErr_Occurred() == NULL ? false : true; - PyGILState_Release(gstate); - if (did_error) { - return -1; - } - } - - if (base == NPY_FR_D) { - return days; - } - - if (base == NPY_FR_W) { - int64_t weeks; - PD_CHECK_OVERFLOW(scaleDaysToWeeks(days, &weeks)); - return weeks; - } - - int64_t hours; - PD_CHECK_OVERFLOW(scaleDaysToHours(days, &hours)); - PD_CHECK_OVERFLOW(checked_int64_add(hours, dts->hour, &hours)); - - if (base == NPY_FR_h) { - return hours; - } - - int64_t minutes; - PD_CHECK_OVERFLOW(scaleHoursToMinutes(hours, &minutes)); - PD_CHECK_OVERFLOW(checked_int64_add(minutes, dts->min, &minutes)); - - if (base == NPY_FR_m) { - return minutes; - } - - int64_t seconds; - PD_CHECK_OVERFLOW(scaleMinutesToSeconds(minutes, &seconds)); - PD_CHECK_OVERFLOW(checked_int64_add(seconds, dts->sec, &seconds)); - - if (base == NPY_FR_s) { - return seconds; - } - - if (base == NPY_FR_ms) { - int64_t milliseconds; - PD_CHECK_OVERFLOW(scaleSecondsToMilliseconds(seconds, &milliseconds)); - PD_CHECK_OVERFLOW( - checked_int64_add(milliseconds, dts->us / 1000, &milliseconds)); - - return milliseconds; - } - - int64_t microseconds; - PD_CHECK_OVERFLOW(scaleSecondsToMicroseconds(seconds, µseconds)); - PD_CHECK_OVERFLOW(checked_int64_add(microseconds, dts->us, µseconds)); - - if (base == NPY_FR_us) { - return microseconds; - } - - if (base == NPY_FR_ns) { - int64_t nanoseconds; - - // Minimum valid timestamp in nanoseconds (1677-09-21 00:12:43.145224193). - const int64_t min_nanoseconds = NPY_MIN_INT64 + 1; - if (microseconds == min_nanoseconds / 1000 - 1) { - // For values within one microsecond of min_nanoseconds, use it as base - // and offset it with nanosecond delta to avoid overflow during scaling. 
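// Concretely: min_nanoseconds / 1000 - 1 == -9223372036854776, and scaling
// that microsecond value by 1000 would give -9223372036854776000, which is
// below INT64_MIN (-9223372036854775808), even though the final nanosecond
// result can still be in range once the sub-microsecond part is added.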
- PD_CHECK_OVERFLOW(checked_int64_add( - min_nanoseconds, (dts->ps - _NS_MIN_DTS.ps) / 1000, &nanoseconds)); - } else { - PD_CHECK_OVERFLOW( - scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); - PD_CHECK_OVERFLOW( - checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); - } - - return nanoseconds; - } - - int64_t picoseconds; - PD_CHECK_OVERFLOW(scaleMicrosecondsToPicoseconds(microseconds, &picoseconds)); - PD_CHECK_OVERFLOW(checked_int64_add(picoseconds, dts->ps, &picoseconds)); - - if (base == NPY_FR_ps) { - return picoseconds; - } - - if (base == NPY_FR_fs) { - int64_t femtoseconds; - PD_CHECK_OVERFLOW( - scalePicosecondsToFemtoseconds(picoseconds, &femtoseconds)); - PD_CHECK_OVERFLOW( - checked_int64_add(femtoseconds, dts->as / 1000, &femtoseconds)); - return femtoseconds; - } - - if (base == NPY_FR_as) { - int64_t attoseconds; - PD_CHECK_OVERFLOW(scalePicosecondsToAttoseconds(picoseconds, &attoseconds)); - PD_CHECK_OVERFLOW(checked_int64_add(attoseconds, dts->as, &attoseconds)); - return attoseconds; - } - - /* Something got corrupted */ - PyGILState_STATE gstate = PyGILState_Ensure(); - PyErr_SetString(PyExc_ValueError, - "NumPy datetime metadata with corrupt unit value"); - PyGILState_Release(gstate); - - return -1; -} - -/* - * Port numpy#13188 https://github.com/numpy/numpy/pull/13188/ - * - * Computes the python `ret, d = divmod(d, unit)`. - * - * Note that GCC is smart enough at -O2 to eliminate the `if(*d < 0)` branch - * for subsequent calls to this command - it is able to deduce that `*d >= 0`. - */ -npy_int64 extract_unit(npy_datetime *d, npy_datetime unit) { - assert(unit > 0); - npy_int64 div = *d / unit; - npy_int64 mod = *d % unit; - if (mod < 0) { - mod += unit; - div -= 1; - } - assert(mod >= 0); - *d = mod; - return div; -} - -/* - * Converts a datetime based on the given metadata into a datetimestruct - */ -void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base, - npy_datetimestruct *out) { - npy_int64 perday; - - /* Initialize the output to all zeros */ - memset(out, 0, sizeof(npy_datetimestruct)); - out->year = 1970; - out->month = 1; - out->day = 1; - - /* - * Note that care must be taken with the / and % operators - * for negative values. 
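 * extract_unit() above implements a floor-style divmod: e.g. for dt = -1
 * with unit 24 it returns -1 and leaves dt = 23, so a datetime of -1 hours
 * comes out as 1969-12-31T23.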
- */ - switch (base) { - case NPY_FR_Y: - out->year = 1970 + dt; - break; - - case NPY_FR_M: - out->year = 1970 + extract_unit(&dt, 12); - out->month = (npy_int32)dt + 1; - break; - - case NPY_FR_W: - /* A week is 7 days */ - set_datetimestruct_days(dt * 7, out); - break; - - case NPY_FR_D: - set_datetimestruct_days(dt, out); - break; - - case NPY_FR_h: - perday = 24LL; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (npy_int32)dt; - break; - - case NPY_FR_m: - perday = 24LL * 60; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (npy_int32)extract_unit(&dt, 60); - out->min = (npy_int32)dt; - break; - - case NPY_FR_s: - perday = 24LL * 60 * 60; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (npy_int32)extract_unit(&dt, 60 * 60); - out->min = (npy_int32)extract_unit(&dt, 60); - out->sec = (npy_int32)dt; - break; - - case NPY_FR_ms: - perday = 24LL * 60 * 60 * 1000; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (npy_int32)extract_unit(&dt, 1000LL * 60 * 60); - out->min = (npy_int32)extract_unit(&dt, 1000LL * 60); - out->sec = (npy_int32)extract_unit(&dt, 1000LL); - out->us = (npy_int32)(dt * 1000); - break; - - case NPY_FR_us: - perday = 24LL * 60LL * 60LL * 1000LL * 1000LL; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 60 * 60); - out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 60); - out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000); - out->us = (npy_int32)dt; - break; - - case NPY_FR_ns: - perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); - out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); - out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000); - out->us = (npy_int32)extract_unit(&dt, 1000LL); - out->ps = (npy_int32)(dt * 1000); - break; - - case NPY_FR_ps: - perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = - (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 60 * 60); - out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 60); - out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000); - out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000); - out->ps = (npy_int32)(dt); - break; - - case NPY_FR_fs: - /* entire range is only +- 2.6 hours */ - out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * - 1000 * 60 * 60); - if (out->hour < 0) { - out->year = 1969; - out->month = 12; - out->day = 31; - out->hour += 24; - assert(out->hour >= 0); - } - out->min = - (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 60); - out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000); - out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000); - out->ps = (npy_int32)extract_unit(&dt, 1000LL); - out->as = (npy_int32)(dt * 1000); - break; - - case NPY_FR_as: - /* entire range is only +- 9.2 seconds */ - out->sec = - (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 1000); - if (out->sec < 0) { - out->year = 1969; - out->month = 12; - out->day = 31; - out->hour = 23; - out->min = 59; - out->sec += 60; - assert(out->sec >= 0); - } - out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000); - out->ps = (npy_int32)extract_unit(&dt, 1000LL * 1000); - out->as 
= (npy_int32)dt; - break; - - default: - PyErr_SetString(PyExc_RuntimeError, - "NumPy datetime metadata is corrupted with invalid " - "base unit"); - } -} - -/* - * Converts a timedelta from a timedeltastruct to a timedelta based - * on a metadata unit. The timedelta is assumed to be valid. - * - * Returns 0 on success, -1 on failure. - */ -void pandas_timedelta_to_timedeltastruct(npy_timedelta td, - NPY_DATETIMEUNIT base, - pandas_timedeltastruct *out) { - /* Initialize the output to all zeros */ - memset(out, 0, sizeof(pandas_timedeltastruct)); - - const npy_int64 sec_per_hour = 3600; - const npy_int64 sec_per_min = 60; - - switch (base) { - case NPY_FR_W: - out->days = 7 * td; - break; - case NPY_FR_D: - out->days = td; - break; - case NPY_FR_h: - out->days = td / 24LL; - td -= out->days * 24LL; - out->hrs = (npy_int32)td; - break; - case NPY_FR_m: - out->days = td / 1440LL; - td -= out->days * 1440LL; - out->hrs = (npy_int32)(td / 60LL); - td -= out->hrs * 60LL; - out->min = (npy_int32)td; - break; - case NPY_FR_s: - case NPY_FR_ms: - case NPY_FR_us: - case NPY_FR_ns: { - const npy_int64 sec_per_day = 86400; - npy_int64 per_sec; - if (base == NPY_FR_s) { - per_sec = 1; - } else if (base == NPY_FR_ms) { - per_sec = 1000; - } else if (base == NPY_FR_us) { - per_sec = 1000000; - } else { - per_sec = 1000000000; - } - - const npy_int64 per_day = sec_per_day * per_sec; - npy_int64 frac; - // put frac in seconds - if (td < 0 && td % per_sec != 0) - frac = td / per_sec - 1; - else - frac = td / per_sec; - - const int sign = frac < 0 ? -1 : 1; - if (frac < 0) { - // even fraction - if ((-frac % sec_per_day) != 0) { - out->days = -frac / sec_per_day + 1; - frac += sec_per_day * out->days; - } else { - frac = -frac; - } - } - - if (frac >= sec_per_day) { - out->days += frac / sec_per_day; - frac -= out->days * sec_per_day; - } - - if (frac >= sec_per_hour) { - out->hrs = (npy_int32)(frac / sec_per_hour); - frac -= out->hrs * sec_per_hour; - } - - if (frac >= sec_per_min) { - out->min = (npy_int32)(frac / sec_per_min); - frac -= out->min * sec_per_min; - } - - if (frac >= 0) { - out->sec = (npy_int32)frac; - frac -= out->sec; - } - - if (sign < 0) - out->days = -out->days; - - if (base > NPY_FR_s) { - const npy_int64 sfrac = - (out->hrs * sec_per_hour + out->min * sec_per_min + out->sec) * - per_sec; - - npy_int64 ifrac = td - (out->days * per_day + sfrac); - - if (base == NPY_FR_ms) { - out->ms = (npy_int32)ifrac; - } else if (base == NPY_FR_us) { - out->ms = (npy_int32)(ifrac / 1000LL); - ifrac = ifrac % 1000LL; - out->us = (npy_int32)ifrac; - } else if (base == NPY_FR_ns) { - out->ms = (npy_int32)(ifrac / (1000LL * 1000LL)); - ifrac = ifrac % (1000LL * 1000LL); - out->us = (npy_int32)(ifrac / 1000LL); - ifrac = ifrac % 1000LL; - out->ns = (npy_int32)ifrac; - } - } - - } break; - default: - PyErr_SetString(PyExc_RuntimeError, - "NumPy timedelta metadata is corrupted with " - "invalid base unit"); - break; - } - - out->seconds = - (npy_int32)(out->hrs * sec_per_hour + out->min * sec_per_min + out->sec); - out->microseconds = out->ms * 1000 + out->us; - out->nanoseconds = out->ns; -} - -/* - * This function returns a pointer to the DateTimeMetaData - * contained within the provided datetime dtype. 
- * - * Copied near-verbatim from numpy/core/src/multiarray/datetime.c - */ -PyArray_DatetimeMetaData -get_datetime_metadata_from_dtype(PyArray_Descr *dtype) { -#if NPY_ABI_VERSION < 0x02000000 -#define PyDataType_C_METADATA(dtype) ((dtype)->c_metadata) -#endif - return ((PyArray_DatetimeDTypeMetaData *)PyDataType_C_METADATA(dtype))->meta; -} diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c deleted file mode 100644 index a46f5bc467c5d..0000000000000 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c +++ /dev/null @@ -1,1168 +0,0 @@ -/* - -Copyright (c) 2016, PyData Development Team -All rights reserved. - -Distributed under the terms of the BSD Simplified License. - -The full license is in the LICENSE file, distributed with this software. - -Written by Mark Wiebe (mwwiebe@gmail.com) -Copyright (c) 2011 by Enthought, Inc. - -Copyright (c) 2005-2011, NumPy Developers -All rights reserved. - -See NUMPY_LICENSE.txt for the license. - -This file implements string parsing and creation for NumPy datetime. - -*/ - -// LICENSES/NUMPY_LICENSE - -#define PY_SSIZE_T_CLEAN -#define NO_IMPORT - -#ifndef NPY_NO_DEPRECATED_API -#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#endif // NPY_NO_DEPRECATED_API - -#include - -#include - -#include -#include - -#include "pandas/portable.h" -#include "pandas/vendored/numpy/datetime/np_datetime.h" -#include "pandas/vendored/numpy/datetime/np_datetime_strings.h" - -/* - * Parses (almost) standard ISO 8601 date strings. The differences are: - * - * + Only seconds may have a decimal point, with up to 18 digits after it - * (maximum attoseconds precision). - * + Either a 'T' as in ISO 8601 or a ' ' may be used to separate - * the date and the time. Both are treated equivalently. - * + Doesn't (yet) handle the "YYYY-DDD" or "YYYY-Www" formats. - * + Doesn't handle leap seconds (seconds value has 60 in these cases). - * + Doesn't handle 24:00:00 as synonym for midnight (00:00:00) tomorrow - * + Accepts special values "NaT" (not a time), "Today", (current - * day according to local time) and "Now" (current time in UTC). - * + ':' separator between hours, minutes, and seconds is optional. When - * omitted, each component must be 2 digits if it appears. (GH-10041) - * - * 'str' must be a NULL-terminated string, and 'len' must be its length. - * - * 'out' gets filled with the parsed date-time. - * 'out_local' gets set to 1 if the parsed time contains timezone, - * to 0 otherwise. - * 'out_tzoffset' gets set to timezone offset by minutes - * if the parsed time was in local time, - * to 0 otherwise. The values 'now' and 'today' don't get counted - * as local, and neither do UTC +/-#### timezone offsets, because - * they aren't using the computer's local timezone offset. - * - * Returns 0 on success, -1 on failure. - */ - -typedef enum { - COMPARISON_SUCCESS, - COMPLETED_PARTIAL_MATCH, - COMPARISON_ERROR -} DatetimePartParseResult; -// This function will advance the pointer on format -// and decrement characters_remaining by n on success -// On failure will return COMPARISON_ERROR without incrementing -// If `format_requirement` is PARTIAL_MATCH, and the `format` string has -// been exhausted, then return COMPLETED_PARTIAL_MATCH. 
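// When `format_requirement` is INFER_FORMAT there is no format string to
// check against, so the comparison trivially succeeds.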
-static DatetimePartParseResult -compare_format(const char **format, int *characters_remaining, - const char *compare_to, int n, - const FormatRequirement format_requirement) { - if (format_requirement == INFER_FORMAT) { - return COMPARISON_SUCCESS; - } - if (*characters_remaining < 0) { - return COMPARISON_ERROR; - } - if (format_requirement == PARTIAL_MATCH && *characters_remaining == 0) { - return COMPLETED_PARTIAL_MATCH; - } - if (*characters_remaining < n) { - // TODO(pandas-dev): PyErr to differentiate what went wrong - return COMPARISON_ERROR; - } else { - if (strncmp(*format, compare_to, n)) { - // TODO(pandas-dev): PyErr to differentiate what went wrong - return COMPARISON_ERROR; - } else { - *format += n; - *characters_remaining -= n; - return COMPARISON_SUCCESS; - } - } - return COMPARISON_SUCCESS; -} - -int parse_iso_8601_datetime(const char *str, int len, int want_exc, - npy_datetimestruct *out, - NPY_DATETIMEUNIT *out_bestunit, int *out_local, - int *out_tzoffset, const char *format, - int format_len, - FormatRequirement format_requirement) { - if (len < 0 || format_len < 0) - goto parse_error; - int year_leap = 0; - int i, numdigits; - const char *substr; - int sublen; - NPY_DATETIMEUNIT bestunit = NPY_FR_GENERIC; - DatetimePartParseResult comparison; - - /* If year-month-day are separated by a valid separator, - * months/days without leading zeroes will be parsed - * (though not iso8601). If the components aren't separated, - * 4 (YYYY) or 8 (YYYYMMDD) digits are expected. 6 digits are - * forbidden here (but parsed as YYMMDD elsewhere). - */ - int has_ymd_sep = 0; - char ymd_sep = '\0'; - char valid_ymd_sep[] = {'-', '.', '/', '\\', ' '}; - int valid_ymd_sep_len = sizeof(valid_ymd_sep); - - /* hour-minute-second may or may not separated by ':'. If not, then - * each component must be 2 digits. 
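 * (e.g. "2023-01-02T0304" parses as 03:04, while a lone single-digit hour
 * such as "2023-01-02T3" is rejected.)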
*/ - int has_hms_sep = 0; - int hour_was_2_digits = 0; - - /* Initialize the output to all zeros */ - memset(out, 0, sizeof(npy_datetimestruct)); - out->month = 1; - out->day = 1; - - substr = str; - sublen = len; - - /* Skip leading whitespace */ - while (sublen > 0 && isspace(*substr)) { - ++substr; - --sublen; - comparison = - compare_format(&format, &format_len, " ", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - } - - /* Leading '-' sign for negative year */ - if (*substr == '-') { - ++substr; - --sublen; - } - - if (sublen == 0) { - goto parse_error; - } - - /* PARSE THE YEAR (4 digits) */ - comparison = - compare_format(&format, &format_len, "%Y", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - - out->year = 0; - if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) && - isdigit(substr[2]) && isdigit(substr[3])) { - out->year = 1000 * (substr[0] - '0') + 100 * (substr[1] - '0') + - 10 * (substr[2] - '0') + (substr[3] - '0'); - - substr += 4; - sublen -= 4; - } - - /* Negate the year if necessary */ - if (str[0] == '-') { - out->year = -out->year; - } - /* Check whether it's a leap-year */ - year_leap = is_leapyear(out->year); - - /* Next character must be a separator, start of month, or end of string */ - if (sublen == 0) { - if (out_local != NULL) { - *out_local = 0; - } - if (format_len) { - goto parse_error; - } - bestunit = NPY_FR_Y; - goto finish; - } - - if (!isdigit(*substr)) { - for (i = 0; i < valid_ymd_sep_len; ++i) { - if (*substr == valid_ymd_sep[i]) { - break; - } - } - if (i == valid_ymd_sep_len) { - goto parse_error; - } - has_ymd_sep = 1; - ymd_sep = valid_ymd_sep[i]; - ++substr; - --sublen; - - comparison = - compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - /* Cannot have trailing separator */ - if (sublen == 0 || !isdigit(*substr)) { - goto parse_error; - } - } - - /* PARSE THE MONTH */ - comparison = - compare_format(&format, &format_len, "%m", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - /* First digit required */ - out->month = (*substr - '0'); - ++substr; - --sublen; - /* Second digit optional if there was a separator */ - if (isdigit(*substr)) { - out->month = 10 * out->month + (*substr - '0'); - ++substr; - --sublen; - } else if (!has_ymd_sep) { - goto parse_error; - } - if (out->month < 1 || out->month > 12) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Month out of range in datetime string \"%s\"", str); - } - goto error; - } - - /* Next character must be the separator, start of day, or end of string */ - if (sublen == 0) { - bestunit = NPY_FR_M; - /* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. 
*/ - if (!has_ymd_sep) { - goto parse_error; - } - if (format_len) { - goto parse_error; - } - if (out_local != NULL) { - *out_local = 0; - } - goto finish; - } - - if (has_ymd_sep) { - /* Must have separator, but cannot be trailing */ - if (*substr != ymd_sep || sublen == 1) { - goto parse_error; - } - ++substr; - --sublen; - comparison = - compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - } - - /* PARSE THE DAY */ - comparison = - compare_format(&format, &format_len, "%d", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - /* First digit required */ - if (!isdigit(*substr)) { - goto parse_error; - } - out->day = (*substr - '0'); - ++substr; - --sublen; - /* Second digit optional if there was a separator */ - if (isdigit(*substr)) { - out->day = 10 * out->day + (*substr - '0'); - ++substr; - --sublen; - } else if (!has_ymd_sep) { - goto parse_error; - } - if (out->day < 1 || - out->day > days_per_month_table[year_leap][out->month - 1]) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Day out of range in datetime string \"%s\"", str); - } - goto error; - } - - /* Next character must be a 'T', ' ', or end of string */ - if (sublen == 0) { - if (out_local != NULL) { - *out_local = 0; - } - if (format_len) { - goto parse_error; - } - bestunit = NPY_FR_D; - goto finish; - } - - if ((*substr != 'T' && *substr != ' ') || sublen == 1) { - goto parse_error; - } - comparison = - compare_format(&format, &format_len, substr, 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - ++substr; - --sublen; - - /* PARSE THE HOURS */ - comparison = - compare_format(&format, &format_len, "%H", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - /* First digit required */ - if (!isdigit(*substr)) { - goto parse_error; - } - out->hour = (*substr - '0'); - bestunit = NPY_FR_h; - ++substr; - --sublen; - /* Second digit optional */ - if (isdigit(*substr)) { - hour_was_2_digits = 1; - out->hour = 10 * out->hour + (*substr - '0'); - ++substr; - --sublen; - if (out->hour >= 24) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Hours out of range in datetime string \"%s\"", str); - } - goto error; - } - } - - /* Next character must be a ':' or the end of the string */ - if (sublen == 0) { - if (!hour_was_2_digits) { - goto parse_error; - } - if (format_len) { - goto parse_error; - } - bestunit = NPY_FR_h; - goto finish; - } - - if (*substr == ':') { - has_hms_sep = 1; - ++substr; - --sublen; - /* Cannot have a trailing separator */ - if (sublen == 0 || !isdigit(*substr)) { - goto parse_error; - } - comparison = - compare_format(&format, &format_len, ":", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - } else if (!isdigit(*substr)) { - if (!hour_was_2_digits) { - goto parse_error; - } - goto parse_timezone; - } - - /* PARSE THE MINUTES */ - comparison = - compare_format(&format, &format_len, "%M", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) 
{ - goto finish; - } - /* First digit required */ - out->min = (*substr - '0'); - bestunit = NPY_FR_m; - ++substr; - --sublen; - /* Second digit optional if there was a separator */ - if (isdigit(*substr)) { - out->min = 10 * out->min + (*substr - '0'); - ++substr; - --sublen; - if (out->min >= 60) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Minutes out of range in datetime string \"%s\"", str); - } - goto error; - } - } else if (!has_hms_sep) { - goto parse_error; - } - - if (sublen == 0) { - bestunit = NPY_FR_m; - if (format_len) { - goto parse_error; - } - goto finish; - } - - /* If we make it through this condition block, then the next - * character is a digit. */ - if (has_hms_sep && *substr == ':') { - comparison = - compare_format(&format, &format_len, ":", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - ++substr; - --sublen; - /* Cannot have a trailing ':' */ - if (sublen == 0 || !isdigit(*substr)) { - goto parse_error; - } - } else if (!has_hms_sep && isdigit(*substr)) { - } else { - goto parse_timezone; - } - - /* PARSE THE SECONDS */ - comparison = - compare_format(&format, &format_len, "%S", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - /* First digit required */ - out->sec = (*substr - '0'); - ++substr; - --sublen; - /* Second digit optional if there was a separator */ - if (isdigit(*substr)) { - out->sec = 10 * out->sec + (*substr - '0'); - ++substr; - --sublen; - if (out->sec >= 60) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Seconds out of range in datetime string \"%s\"", str); - } - goto error; - } - } else if (!has_hms_sep) { - goto parse_error; - } - - /* Next character may be a '.' 
indicating fractional seconds */ - if (sublen > 0 && *substr == '.') { - ++substr; - --sublen; - comparison = - compare_format(&format, &format_len, ".", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - } else { - bestunit = NPY_FR_s; - goto parse_timezone; - } - - /* PARSE THE MICROSECONDS (0 to 6 digits) */ - comparison = - compare_format(&format, &format_len, "%f", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - numdigits = 0; - for (i = 0; i < 6; ++i) { - out->us *= 10; - if (sublen > 0 && isdigit(*substr)) { - out->us += (*substr - '0'); - ++substr; - --sublen; - ++numdigits; - } - } - - if (sublen == 0 || !isdigit(*substr)) { - if (numdigits > 3) { - bestunit = NPY_FR_us; - } else { - bestunit = NPY_FR_ms; - } - goto parse_timezone; - } - - /* PARSE THE PICOSECONDS (0 to 6 digits) */ - numdigits = 0; - for (i = 0; i < 6; ++i) { - out->ps *= 10; - if (sublen > 0 && isdigit(*substr)) { - out->ps += (*substr - '0'); - ++substr; - --sublen; - ++numdigits; - } - } - - if (sublen == 0 || !isdigit(*substr)) { - if (numdigits > 3) { - bestunit = NPY_FR_ps; - } else { - bestunit = NPY_FR_ns; - } - goto parse_timezone; - } - - /* PARSE THE ATTOSECONDS (0 to 6 digits) */ - numdigits = 0; - for (i = 0; i < 6; ++i) { - out->as *= 10; - if (sublen > 0 && isdigit(*substr)) { - out->as += (*substr - '0'); - ++substr; - --sublen; - ++numdigits; - } - } - - if (numdigits > 3) { - bestunit = NPY_FR_as; - } else { - bestunit = NPY_FR_fs; - } - -parse_timezone: - /* trim any whitespace between time/timezone */ - while (sublen > 0 && isspace(*substr)) { - ++substr; - --sublen; - comparison = - compare_format(&format, &format_len, " ", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - } - - if (sublen == 0) { - // Unlike NumPy, treating no time zone as naive - if (format_len > 0) { - goto parse_error; - } - goto finish; - } - - /* UTC specifier */ - if (*substr == 'Z') { - comparison = - compare_format(&format, &format_len, "%z", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - /* "Z" should be equivalent to tz offset "+00:00" */ - if (out_local != NULL) { - *out_local = 1; - } - - if (out_tzoffset != NULL) { - *out_tzoffset = 0; - } - - if (sublen == 1) { - if (format_len > 0) { - goto parse_error; - } - goto finish; - } else { - ++substr; - --sublen; - } - } else if (*substr == '-' || *substr == '+') { - comparison = - compare_format(&format, &format_len, "%z", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - /* Time zone offset */ - int offset_neg = 0, offset_hour = 0, offset_minute = 0; - - /* - * Since "local" means local with respect to the current - * machine, we say this is non-local. 
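 * The resulting out_tzoffset is expressed in minutes east of UTC, e.g.
 * "+05:30" yields 330 and "-0800" yields -480.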
- */ - - if (*substr == '-') { - offset_neg = 1; - } - ++substr; - --sublen; - - /* The hours offset */ - if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { - offset_hour = 10 * (substr[0] - '0') + (substr[1] - '0'); - substr += 2; - sublen -= 2; - if (offset_hour >= 24) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Timezone hours offset out of range " - "in datetime string \"%s\"", - str); - } - goto error; - } - } else if (sublen >= 1 && isdigit(substr[0])) { - offset_hour = substr[0] - '0'; - ++substr; - --sublen; - } else { - goto parse_error; - } - - /* The minutes offset is optional */ - if (sublen > 0) { - /* Optional ':' */ - if (*substr == ':') { - ++substr; - --sublen; - } - - /* The minutes offset (at the end of the string) */ - if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { - offset_minute = 10 * (substr[0] - '0') + (substr[1] - '0'); - substr += 2; - sublen -= 2; - if (offset_minute >= 60) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Timezone minutes offset out of range " - "in datetime string \"%s\"", - str); - } - goto error; - } - } else if (sublen >= 1 && isdigit(substr[0])) { - offset_minute = substr[0] - '0'; - ++substr; - --sublen; - } else { - goto parse_error; - } - } - - /* Apply the time zone offset */ - if (offset_neg) { - offset_hour = -offset_hour; - offset_minute = -offset_minute; - } - if (out_local != NULL) { - *out_local = 1; - // Unlike NumPy, do not change internal value to local time - *out_tzoffset = 60 * offset_hour + offset_minute; - } - } - - /* Skip trailing whitespace */ - while (sublen > 0 && isspace(*substr)) { - ++substr; - --sublen; - comparison = - compare_format(&format, &format_len, " ", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - } - - if ((sublen != 0) || (format_len != 0)) { - goto parse_error; - } - -finish: - if (out_bestunit != NULL) { - *out_bestunit = bestunit; - } - return 0; - -parse_error: - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Error parsing datetime string \"%s\" at position %d", str, - (int)(substr - str)); - } - return -1; - -error: - return -1; -} - -/* - * Provides a string length to use for converting datetime - * objects with the given local and unit settings. 
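 * e.g. seconds precision with a trailing 'Z' budgets 21 (year) + 3 ("-##")
 * + 3 ("-##") + 3 ("T##") + 3 (":##") + 3 (":##") + 1 ('Z') + 1 (NUL)
 * = 38 bytes.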
- */ -int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) { - int len = 0; - - switch (base) { - /* Generic units can only be used to represent NaT */ - /* return 4;*/ - case NPY_FR_as: - len += 3; /* "###" */ - PD_FALLTHROUGH; - case NPY_FR_fs: - len += 3; /* "###" */ - PD_FALLTHROUGH; - case NPY_FR_ps: - len += 3; /* "###" */ - PD_FALLTHROUGH; - case NPY_FR_ns: - len += 3; /* "###" */ - PD_FALLTHROUGH; - case NPY_FR_us: - len += 3; /* "###" */ - PD_FALLTHROUGH; - case NPY_FR_ms: - len += 4; /* ".###" */ - PD_FALLTHROUGH; - case NPY_FR_s: - len += 3; /* ":##" */ - PD_FALLTHROUGH; - case NPY_FR_m: - len += 3; /* ":##" */ - PD_FALLTHROUGH; - case NPY_FR_h: - len += 3; /* "T##" */ - PD_FALLTHROUGH; - case NPY_FR_D: - case NPY_FR_W: - len += 3; /* "-##" */ - PD_FALLTHROUGH; - case NPY_FR_M: - len += 3; /* "-##" */ - PD_FALLTHROUGH; - case NPY_FR_Y: - len += 21; /* 64-bit year */ - break; - default: - len += 3; /* handle the now defunct NPY_FR_B */ - break; - } - - if (base >= NPY_FR_h) { - if (local) { - len += 5; /* "+####" or "-####" */ - } else { - len += 1; /* "Z" */ - } - } - - len += 1; /* NULL terminator */ - - return len; -} - -/* - * Converts an npy_datetimestruct to an (almost) ISO 8601 - * NULL-terminated string using timezone Z (UTC). If the string fits in - * the space exactly, it leaves out the NULL terminator and returns success. - * - * The differences from ISO 8601 are the 'NaT' string, and - * the number of year digits is >= 4 instead of strictly 4. - * - * 'base' restricts the output to that unit. Set 'base' to - * -1 to auto-detect a base after which all the values are zero. - * - * Returns 0 on success, -1 on failure (for example if the output - * string was too short). - */ -int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, size_t outlen, - int utc, NPY_DATETIMEUNIT base) { - char *substr = outstr; - size_t sublen = outlen; - int tmplen; - - /* - * Print weeks with the same precision as days. - * - * TODO: Could print weeks with YYYY-Www format if the week - * epoch is a Monday. - */ - if (base == NPY_FR_W) { - base = NPY_FR_D; - } - -/* YEAR */ -/* - * Can't use PyOS_snprintf, because it always produces a '\0' - * character at the end, and NumPy string types are permitted - * to have data all the way to the end of the buffer. 
- */ -#ifdef _WIN32 - tmplen = _snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); -#else - tmplen = snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); -#endif // _WIN32 - /* If it ran out of space or there isn't space for the NULL terminator */ - if (tmplen < 0 || (size_t)tmplen > sublen) { - goto string_too_short; - } - substr += tmplen; - sublen -= tmplen; - - /* Stop if the unit is years */ - if (base == NPY_FR_Y) { - if (sublen > 0) { - *substr = '\0'; - } - return 0; - } - - /* MONTH */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = '-'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->month / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->month % 10) + '0'); - substr += 3; - sublen -= 3; - - /* Stop if the unit is months */ - if (base == NPY_FR_M) { - if (sublen > 0) { - *substr = '\0'; - } - return 0; - } - - /* DAY */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = '-'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->day / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->day % 10) + '0'); - substr += 3; - sublen -= 3; - - /* Stop if the unit is days */ - if (base == NPY_FR_D) { - if (sublen > 0) { - *substr = '\0'; - } - return 0; - } - - /* HOUR */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = 'T'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->hour / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->hour % 10) + '0'); - substr += 3; - sublen -= 3; - - /* Stop if the unit is hours */ - if (base == NPY_FR_h) { - goto add_time_zone; - } - - /* MINUTE */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = ':'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->min / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->min % 10) + '0'); - substr += 3; - sublen -= 3; - - /* Stop if the unit is minutes */ - if (base == NPY_FR_m) { - goto add_time_zone; - } - - /* SECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = ':'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->sec / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->sec % 10) + '0'); - substr += 3; - sublen -= 3; - - /* Stop if the unit is seconds */ - if (base == NPY_FR_s) { - goto add_time_zone; - } - - /* MILLISECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = '.'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->us / 100000) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->us / 10000) % 10 + '0'); - if (sublen < 4) { - goto string_too_short; - } - substr[3] = (char)((dts->us / 1000) % 10 + '0'); - substr += 4; - sublen -= 4; - - /* Stop if the unit is milliseconds */ - if (base == NPY_FR_ms) { - goto add_time_zone; - } - - /* MICROSECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = (char)((dts->us / 100) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->us / 10) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)(dts->us % 10 + '0'); - substr += 3; - sublen -= 3; - - /* Stop if the unit is microseconds */ - if (base == NPY_FR_us) { - goto add_time_zone; - } - - /* NANOSECOND */ - if (sublen < 1) { - goto string_too_short; - } - 
substr[0] = (char)((dts->ps / 100000) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->ps / 10000) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->ps / 1000) % 10 + '0'); - substr += 3; - sublen -= 3; - - /* Stop if the unit is nanoseconds */ - if (base == NPY_FR_ns) { - goto add_time_zone; - } - - /* PICOSECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = (char)((dts->ps / 100) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->ps / 10) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)(dts->ps % 10 + '0'); - substr += 3; - sublen -= 3; - - /* Stop if the unit is picoseconds */ - if (base == NPY_FR_ps) { - goto add_time_zone; - } - - /* FEMTOSECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = (char)((dts->as / 100000) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->as / 10000) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->as / 1000) % 10 + '0'); - substr += 3; - sublen -= 3; - - /* Stop if the unit is femtoseconds */ - if (base == NPY_FR_fs) { - goto add_time_zone; - } - - /* ATTOSECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = (char)((dts->as / 100) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->as / 10) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)(dts->as % 10 + '0'); - substr += 3; - sublen -= 3; - -add_time_zone: - /* UTC "Zulu" time */ - if (utc) { - if (sublen < 1) { - goto string_too_short; - } - substr[0] = 'Z'; - substr += 1; - sublen -= 1; - } - /* Add a NULL terminator, and return */ - if (sublen > 0) { - substr[0] = '\0'; - } - - return 0; - -string_too_short: - PyErr_Format(PyExc_RuntimeError, - "The string provided for NumPy ISO datetime formatting " - "was too short, with length %d", - outlen); - return -1; -} - -int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, - size_t *outlen) { - *outlen = 0; - *outlen += snprintf(outstr, 60, // NOLINT - "P%" NPY_INT64_FMT "DT%" NPY_INT32_FMT "H%" NPY_INT32_FMT - "M%" NPY_INT32_FMT, - tds->days, tds->hrs, tds->min, tds->sec); - outstr += *outlen; - - if (tds->ns != 0) { - *outlen += snprintf(outstr, 12, // NOLINT - ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT - "%03" NPY_INT32_FMT "S", - tds->ms, tds->us, tds->ns); - } else if (tds->us != 0) { - *outlen += snprintf(outstr, 9, // NOLINT - ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, - tds->us); - } else if (tds->ms != 0) { - *outlen += snprintf(outstr, 6, // NOLINT - ".%03" NPY_INT32_FMT "S", tds->ms); - } else { - *outlen += snprintf(outstr, 2, // NOLINT - "%s", "S"); - } - - return 0; -} diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c deleted file mode 100644 index bf389b4dce1d0..0000000000000 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c +++ /dev/null @@ -1,1221 +0,0 @@ -/* -Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: -* Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. 
-* Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. -* Neither the name of the ESN Social Software AB nor the -names of its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE -LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights -reserved. - -Numeric decoder derived from TCL library -https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms -* Copyright (c) 1988-1993 The Regents of the University of California. -* Copyright (c) 1994 Sun Microsystems, Inc. -*/ - -// Licence at LICENSES/ULTRAJSON_LICENSE - -#include "pandas/vendored/ujson/lib/ultrajson.h" -#include -#include -#include -#include -#include -#include -#include - -#ifndef TRUE -#define TRUE 1 -#define FALSE 0 -#endif -#ifndef NULL -#define NULL 0 -#endif - -struct DecoderState { - char *start; - char *end; - wchar_t *escStart; - wchar_t *escEnd; - int escHeap; - int lastType; - JSUINT32 objDepth; - void *prv; - JSONObjectDecoder *dec; -}; - -JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds); -typedef JSOBJ (*PFN_DECODER)(struct DecoderState *ds); - -static JSOBJ SetError(struct DecoderState *ds, int offset, - const char *message) { - ds->dec->errorOffset = ds->start + offset; - ds->dec->errorStr = (char *)message; - return NULL; -} - -double createDouble(double intNeg, double intValue, double frcValue, - int frcDecimalCount) { - static const double g_pow10[] = {1.0, - 0.1, - 0.01, - 0.001, - 0.0001, - 0.00001, - 0.000001, - 0.0000001, - 0.00000001, - 0.000000001, - 0.0000000001, - 0.00000000001, - 0.000000000001, - 0.0000000000001, - 0.00000000000001, - 0.000000000000001}; - return (intValue + (frcValue * g_pow10[frcDecimalCount])) * intNeg; -} - -JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) { - char *end; - double value; - errno = 0; - - value = strtod(ds->start, &end); - - if (errno == ERANGE) { - return SetError(ds, -1, "Range error when decoding numeric as double"); - } - - ds->start = end; - return ds->dec->newDouble(ds->prv, value); -} - -JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { - int intNeg = 1; - JSUINT64 intValue; - JSUINT64 prevIntValue; - int chr; - int decimalCount = 0; - double frcValue = 0.0; - double expNeg; - double expValue; - char *offset = ds->start; - - JSUINT64 overflowLimit = LLONG_MAX; - - if (*(offset) == 'I') { - goto DECODE_INF; - } else if (*(offset) == 'N') { - goto DECODE_NAN; - } else if (*(offset) == '-') { - 
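    // A leading '-' flips the sign and widens the overflow bound: the
    // magnitude of a negative int64 can reach 2^63, which is exactly the
    // value of (JSUINT64)LLONG_MIN assigned below.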
offset++; - intNeg = -1; - overflowLimit = LLONG_MIN; - if (*(offset) == 'I') { - goto DECODE_INF; - } - } - - // Scan integer part - intValue = 0; - - while (1) { - chr = (int)(unsigned char)*(offset); - - switch (chr) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - // PERF: Don't do 64-bit arithmetic here unless we have to - prevIntValue = intValue; - intValue = intValue * 10ULL + (JSLONG)(chr - 48); - - if (intNeg == 1 && prevIntValue > intValue) { - return SetError(ds, -1, "Value is too big!"); - } else if (intNeg == -1 && intValue > overflowLimit) { - return SetError(ds, -1, - overflowLimit == LLONG_MAX ? "Value is too big!" - : "Value is too small"); - } - - offset++; - break; - } - case '.': { - offset++; - goto DECODE_FRACTION; - break; - } - case 'e': - case 'E': { - offset++; - goto DECODE_EXPONENT; - break; - } - - default: { - goto BREAK_INT_LOOP; - break; - } - } - } - -BREAK_INT_LOOP: - - ds->lastType = JT_INT; - ds->start = offset; - - if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0) - return ds->dec->newUnsignedLong(ds->prv, intValue); - else if ((intValue >> 31)) - return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg)); - else - return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg)); - -DECODE_FRACTION: - - if (ds->dec->preciseFloat) { - return decodePreciseFloat(ds); - } - - // Scan fraction part - frcValue = 0.0; - for (;;) { - chr = (int)(unsigned char)*(offset); - - switch (chr) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) { - frcValue = frcValue * 10.0 + (double)(chr - 48); - decimalCount++; - } - offset++; - break; - } - case 'e': - case 'E': { - offset++; - goto DECODE_EXPONENT; - break; - } - default: { - goto BREAK_FRC_LOOP; - } - } - } - -BREAK_FRC_LOOP: - // FIXME: Check for arithmetic overflow here - ds->lastType = JT_DOUBLE; - ds->start = offset; - return ds->dec->newDouble( - ds->prv, - createDouble((double)intNeg, (double)intValue, frcValue, decimalCount)); - -DECODE_EXPONENT: - if (ds->dec->preciseFloat) { - return decodePreciseFloat(ds); - } - - expNeg = 1.0; - - if (*(offset) == '-') { - expNeg = -1.0; - offset++; - } else if (*(offset) == '+') { - expNeg = +1.0; - offset++; - } - - expValue = 0.0; - - for (;;) { - chr = (int)(unsigned char)*(offset); - - switch (chr) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - expValue = expValue * 10.0 + (double)(chr - 48); - offset++; - break; - } - default: { - goto BREAK_EXP_LOOP; - } - } - } - -DECODE_NAN: - offset++; - if (*(offset++) != 'a') - goto SET_NAN_ERROR; - if (*(offset++) != 'N') - goto SET_NAN_ERROR; - - ds->lastType = JT_NULL; - ds->start = offset; - return ds->dec->newNull(ds->prv); - -SET_NAN_ERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'NaN'"); - -DECODE_INF: - offset++; - if (*(offset++) != 'n') - goto SET_INF_ERROR; - if (*(offset++) != 'f') - goto SET_INF_ERROR; - if (*(offset++) != 'i') - goto SET_INF_ERROR; - if (*(offset++) != 'n') - goto SET_INF_ERROR; - if (*(offset++) != 'i') - goto SET_INF_ERROR; - if (*(offset++) != 't') - goto SET_INF_ERROR; - if (*(offset++) != 'y') - goto SET_INF_ERROR; - - ds->start = offset; - - if (intNeg == 1) { - ds->lastType = JT_POS_INF; - return ds->dec->newPosInf(ds->prv); - } else { - ds->lastType = JT_NEG_INF; - 
return ds->dec->newNegInf(ds->prv); - } - -SET_INF_ERROR: - if (intNeg == 1) { - const char *msg = "Unexpected character found when decoding 'Infinity'"; - return SetError(ds, -1, msg); - } else { - const char *msg = "Unexpected character found when decoding '-Infinity'"; - return SetError(ds, -1, msg); - } - -BREAK_EXP_LOOP: - // FIXME: Check for arithmetic overflow here - ds->lastType = JT_DOUBLE; - ds->start = offset; - return ds->dec->newDouble( - ds->prv, - createDouble((double)intNeg, (double)intValue, frcValue, decimalCount) * - pow(10.0, expValue * expNeg)); -} - -JSOBJ FASTCALL_MSVC decode_true(struct DecoderState *ds) { - char *offset = ds->start; - offset++; - - if (*(offset++) != 'r') - goto SETERROR; - if (*(offset++) != 'u') - goto SETERROR; - if (*(offset++) != 'e') - goto SETERROR; - - ds->lastType = JT_TRUE; - ds->start = offset; - return ds->dec->newTrue(ds->prv); - -SETERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'true'"); -} - -JSOBJ FASTCALL_MSVC decode_false(struct DecoderState *ds) { - char *offset = ds->start; - offset++; - - if (*(offset++) != 'a') - goto SETERROR; - if (*(offset++) != 'l') - goto SETERROR; - if (*(offset++) != 's') - goto SETERROR; - if (*(offset++) != 'e') - goto SETERROR; - - ds->lastType = JT_FALSE; - ds->start = offset; - return ds->dec->newFalse(ds->prv); - -SETERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'false'"); -} - -JSOBJ FASTCALL_MSVC decode_null(struct DecoderState *ds) { - char *offset = ds->start; - offset++; - - if (*(offset++) != 'u') - goto SETERROR; - if (*(offset++) != 'l') - goto SETERROR; - if (*(offset++) != 'l') - goto SETERROR; - - ds->lastType = JT_NULL; - ds->start = offset; - return ds->dec->newNull(ds->prv); - -SETERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'null'"); -} - -void FASTCALL_MSVC SkipWhitespace(struct DecoderState *ds) { - char *offset; - - for (offset = ds->start; (ds->end - offset) > 0; offset++) { - switch (*offset) { - case ' ': - case '\t': - case '\r': - case '\n': - break; - - default: - ds->start = offset; - return; - } - } - - if (offset == ds->end) { - ds->start = ds->end; - } -} - -enum DECODESTRINGSTATE { - DS_ISNULL = 0x32, - DS_ISQUOTE, - DS_ISESCAPE, - DS_UTFLENERROR, -}; - -static const JSUINT8 g_decoderLookup[256] = { - /* 0x00 */ DS_ISNULL, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0x10 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0x20 */ 1, - 1, - DS_ISQUOTE, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0x30 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0x40 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0x50 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - DS_ISESCAPE, - 1, - 1, - 1, - /* 0x60 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0x70 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0x80 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0x90 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0xa0 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0xb0 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0xc0 */ 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 
2, - /* 0xd0 */ 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - /* 0xe0 */ 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - /* 0xf0 */ 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - DS_UTFLENERROR, - DS_UTFLENERROR, - DS_UTFLENERROR, - DS_UTFLENERROR, - DS_UTFLENERROR, - DS_UTFLENERROR, - DS_UTFLENERROR, - DS_UTFLENERROR, -}; - -JSOBJ FASTCALL_MSVC decode_string(struct DecoderState *ds) { - JSUTF16 sur[2] = {0}; - int iSur = 0; - int index; - wchar_t *escOffset; - wchar_t *escStart; - size_t escLen = (ds->escEnd - ds->escStart); - JSUINT8 *inputOffset; - JSUINT8 oct; - JSUTF32 ucs; - ds->lastType = JT_INVALID; - ds->start++; - - if ((size_t)(ds->end - ds->start) > escLen) { - size_t newSize = (ds->end - ds->start); - - if (ds->escHeap) { - if (newSize > (SIZE_MAX / sizeof(wchar_t))) { - return SetError(ds, -1, "Could not reserve memory block"); - } - escStart = - (wchar_t *)ds->dec->realloc(ds->escStart, newSize * sizeof(wchar_t)); - if (!escStart) { - ds->dec->free(ds->escStart); - return SetError(ds, -1, "Could not reserve memory block"); - } - ds->escStart = escStart; - } else { - wchar_t *oldStart = ds->escStart; - if (newSize > (SIZE_MAX / sizeof(wchar_t))) { - return SetError(ds, -1, "Could not reserve memory block"); - } - ds->escStart = (wchar_t *)ds->dec->malloc(newSize * sizeof(wchar_t)); - if (!ds->escStart) { - return SetError(ds, -1, "Could not reserve memory block"); - } - ds->escHeap = 1; - memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t)); - } - - ds->escEnd = ds->escStart + newSize; - } - - escOffset = ds->escStart; - inputOffset = (JSUINT8 *)ds->start; - - for (;;) { - switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) { - case DS_ISNULL: { - return SetError(ds, -1, "Unmatched ''\"' when when decoding 'string'"); - } - case DS_ISQUOTE: { - ds->lastType = JT_UTF8; - inputOffset++; - ds->start += ((char *)inputOffset - (ds->start)); - return ds->dec->newString(ds->prv, ds->escStart, escOffset); - } - case DS_UTFLENERROR: { - return SetError(ds, -1, - "Invalid UTF-8 sequence length when decoding 'string'"); - } - case DS_ISESCAPE: - inputOffset++; - switch (*inputOffset) { - case '\\': - *(escOffset++) = L'\\'; - inputOffset++; - continue; - case '\"': - *(escOffset++) = L'\"'; - inputOffset++; - continue; - case '/': - *(escOffset++) = L'/'; - inputOffset++; - continue; - case 'b': - *(escOffset++) = L'\b'; - inputOffset++; - continue; - case 'f': - *(escOffset++) = L'\f'; - inputOffset++; - continue; - case 'n': - *(escOffset++) = L'\n'; - inputOffset++; - continue; - case 'r': - *(escOffset++) = L'\r'; - inputOffset++; - continue; - case 't': - *(escOffset++) = L'\t'; - inputOffset++; - continue; - - case 'u': { - int index; - inputOffset++; - - for (index = 0; index < 4; index++) { - switch (*inputOffset) { - case '\0': - return SetError(ds, -1, - "Unterminated unicode " - "escape sequence when " - "decoding 'string'"); - default: - return SetError(ds, -1, - "Unexpected character in " - "unicode escape sequence " - "when decoding 'string'"); - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - sur[iSur] = (sur[iSur] << 4) + (JSUTF16)(*inputOffset - '0'); - break; - - case 'a': - case 'b': - case 'c': - case 'd': - case 'e': - case 'f': - sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16)(*inputOffset - 'a'); - break; - - case 'A': - case 'B': - case 'C': - case 'D': - case 'E': - case 'F': - sur[iSur] = (sur[iSur] << 4) + 10 + 
(JSUTF16)(*inputOffset - 'A'); - break; - } - - inputOffset++; - } - - if (iSur == 0) { - if ((sur[iSur] & 0xfc00) == 0xd800) { - // First of a surrogate pair, continue parsing - iSur++; - break; - } - (*escOffset++) = (wchar_t)sur[iSur]; - iSur = 0; - } else { - // Decode pair - if ((sur[1] & 0xfc00) != 0xdc00) { - return SetError(ds, -1, - "Unpaired high surrogate when " - "decoding 'string'"); - } -#if WCHAR_MAX == 0xffff - (*escOffset++) = (wchar_t)sur[0]; - (*escOffset++) = (wchar_t)sur[1]; -#else - (*escOffset++) = (wchar_t)0x10000 + - (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00)); -#endif - iSur = 0; - } - break; - } - - case '\0': - return SetError(ds, -1, - "Unterminated escape sequence when " - "decoding 'string'"); - default: - return SetError(ds, -1, - "Unrecognized escape sequence when " - "decoding 'string'"); - } - break; - - case 1: { - *(escOffset++) = (wchar_t)(*inputOffset++); - break; - } - - case 2: { - ucs = (*inputOffset++) & 0x1f; - ucs <<= 6; - if (((*inputOffset) & 0x80) != 0x80) { - return SetError(ds, -1, - "Invalid octet in UTF-8 sequence when " - "decoding 'string'"); - } - ucs |= (*inputOffset++) & 0x3f; - if (ucs < 0x80) - return SetError(ds, -1, - "Overlong 2 byte UTF-8 sequence detected " - "when decoding 'string'"); - *(escOffset++) = (wchar_t)ucs; - break; - } - - case 3: { - JSUTF32 ucs = 0; - ucs |= (*inputOffset++) & 0x0f; - - for (index = 0; index < 2; index++) { - ucs <<= 6; - oct = (*inputOffset++); - - if ((oct & 0x80) != 0x80) { - return SetError(ds, -1, - "Invalid octet in UTF-8 sequence when " - "decoding 'string'"); - } - - ucs |= oct & 0x3f; - } - - if (ucs < 0x800) - return SetError(ds, -1, - "Overlong 3 byte UTF-8 sequence detected " - "when encoding string"); - *(escOffset++) = (wchar_t)ucs; - break; - } - - case 4: { - JSUTF32 ucs = 0; - ucs |= (*inputOffset++) & 0x07; - - for (index = 0; index < 3; index++) { - ucs <<= 6; - oct = (*inputOffset++); - - if ((oct & 0x80) != 0x80) { - return SetError(ds, -1, - "Invalid octet in UTF-8 sequence when " - "decoding 'string'"); - } - - ucs |= oct & 0x3f; - } - - if (ucs < 0x10000) - return SetError(ds, -1, - "Overlong 4 byte UTF-8 sequence detected " - "when decoding 'string'"); - -#if WCHAR_MAX == 0xffff - if (ucs >= 0x10000) { - ucs -= 0x10000; - *(escOffset++) = (wchar_t)(ucs >> 10) + 0xd800; - *(escOffset++) = (wchar_t)(ucs & 0x3ff) + 0xdc00; - } else { - *(escOffset++) = (wchar_t)ucs; - } -#else - *(escOffset++) = (wchar_t)ucs; -#endif - break; - } - } - } -} - -JSOBJ FASTCALL_MSVC decode_array(struct DecoderState *ds) { - JSOBJ itemValue; - JSOBJ newObj; - int len; - ds->objDepth++; - if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { - return SetError(ds, -1, "Reached object decoding depth limit"); - } - - newObj = ds->dec->newArray(ds->prv, ds->dec); - len = 0; - - ds->lastType = JT_INVALID; - ds->start++; - - for (;;) { - SkipWhitespace(ds); - - if ((*ds->start) == ']') { - ds->objDepth--; - if (len == 0) { - ds->start++; - return ds->dec->endArray(ds->prv, newObj); - } - - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return SetError( - ds, -1, "Unexpected character found when decoding array value (1)"); - } - - itemValue = decode_any(ds); - - if (itemValue == NULL) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return NULL; - } - - if (!ds->dec->arrayAddItem(ds->prv, newObj, itemValue)) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return NULL; - } - - SkipWhitespace(ds); - - switch (*(ds->start++)) { - case ']': { - ds->objDepth--; - return 
ds->dec->endArray(ds->prv, newObj); - } - case ',': - break; - - default: - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return SetError( - ds, -1, "Unexpected character found when decoding array value (2)"); - } - - len++; - } -} - -JSOBJ FASTCALL_MSVC decode_object(struct DecoderState *ds) { - JSOBJ itemName; - JSOBJ itemValue; - JSOBJ newObj; - - ds->objDepth++; - if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { - return SetError(ds, -1, "Reached object decoding depth limit"); - } - - newObj = ds->dec->newObject(ds->prv, ds->dec); - - ds->start++; - - for (;;) { - SkipWhitespace(ds); - - if ((*ds->start) == '}') { - ds->objDepth--; - ds->start++; - return ds->dec->endObject(ds->prv, newObj); - } - - ds->lastType = JT_INVALID; - itemName = decode_any(ds); - - if (itemName == NULL) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return NULL; - } - - if (ds->lastType != JT_UTF8) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - return SetError( - ds, -1, "Key name of object must be 'string' when decoding 'object'"); - } - - SkipWhitespace(ds); - - if (*(ds->start++) != ':') { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - return SetError(ds, -1, "No ':' found when decoding object value"); - } - - SkipWhitespace(ds); - - itemValue = decode_any(ds); - - if (itemValue == NULL) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - return NULL; - } - - if (!ds->dec->objectAddKey(ds->prv, newObj, itemName, itemValue)) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - ds->dec->releaseObject(ds->prv, itemValue, ds->dec); - return NULL; - } - - SkipWhitespace(ds); - - switch (*(ds->start++)) { - case '}': { - ds->objDepth--; - return ds->dec->endObject(ds->prv, newObj); - } - case ',': - break; - - default: - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return SetError(ds, -1, - "Unexpected character found when decoding object value"); - } - } -} - -JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) { - for (;;) { - switch (*ds->start) { - case '\"': - return decode_string(ds); - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case 'I': - case 'N': - case '-': - return decode_numeric(ds); - - case '[': - return decode_array(ds); - case '{': - return decode_object(ds); - case 't': - return decode_true(ds); - case 'f': - return decode_false(ds); - case 'n': - return decode_null(ds); - - case ' ': - case '\t': - case '\r': - case '\n': - // White space - ds->start++; - break; - - default: - return SetError(ds, -1, "Expected object or value"); - } - } -} - -JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, - size_t cbBuffer) { - /* - FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode - escaping doesn't run into the wall each time */ - char *locale; - struct DecoderState ds; - wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))]; - JSOBJ ret; - - ds.start = (char *)buffer; - ds.end = ds.start + cbBuffer; - - ds.escStart = escBuffer; - ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t)); - ds.escHeap = 0; - ds.prv = dec->prv; - ds.dec = dec; - ds.dec->errorStr = NULL; - ds.dec->errorOffset = NULL; - ds.objDepth = 0; - - ds.dec = dec; - - locale = setlocale(LC_NUMERIC, NULL); - if (!locale) { - return 
SetError(&ds, -1, "setlocale call failed"); - } - - if (strcmp(locale, "C")) { - size_t len = strlen(locale) + 1; - char *saved_locale = malloc(len); - if (saved_locale == NULL) { - return SetError(&ds, -1, "Could not reserve memory block"); - } - memcpy(saved_locale, locale, len); - setlocale(LC_NUMERIC, "C"); - ret = decode_any(&ds); - setlocale(LC_NUMERIC, saved_locale); - free(saved_locale); - } else { - ret = decode_any(&ds); - } - - if (ds.escHeap) { - dec->free(ds.escStart); - } - - SkipWhitespace(&ds); - - if (ds.start != ds.end && ret) { - dec->releaseObject(ds.prv, ret, ds.dec); - return SetError(&ds, -1, "Trailing data"); - } - - return ret; -} diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c deleted file mode 100644 index 1564ecb64b01d..0000000000000 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ /dev/null @@ -1,1205 +0,0 @@ -/* -Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the ESN Social Software AB nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE -LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights -reserved. - -Numeric decoder derived from TCL library -https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms - * Copyright (c) 1988-1993 The Regents of the University of California. - * Copyright (c) 1994 Sun Microsystems, Inc. -*/ - -// Licence at LICENSES/ULTRAJSON_LICENSE - -#include "pandas/portable.h" -#include "pandas/vendored/ujson/lib/ultrajson.h" -#include -#include -#include -#include -#include -#include - -#ifndef TRUE -#define TRUE 1 -#endif -#ifndef FALSE -#define FALSE 0 -#endif - -/* -Worst cases being: - -Control characters (ASCII < 32) -0x00 (1 byte) input => \u0000 output (6 bytes) -1 * 6 => 6 (6 bytes required) - -or UTF-16 surrogate pairs -4 bytes input in UTF-8 => \uXXXX\uYYYY (12 bytes). 
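In both cases 6 output bytes per input byte is a safe upper bound, which is
what RESERVE_STRING reserves below: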
- -4 * 6 => 24 bytes (12 bytes required) - -The extra 2 bytes are for the quotes around the string - -*/ -#define RESERVE_STRING(_len) (2 + ((_len) * 6)) - -static const double g_pow10[] = {1, - 10, - 100, - 1000, - 10000, - 100000, - 1000000, - 10000000, - 100000000, - 1000000000, - 10000000000, - 100000000000, - 1000000000000, - 10000000000000, - 100000000000000, - 1000000000000000}; -static const char g_hexChars[] = "0123456789abcdef"; -static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\\\/"; - -/* -FIXME: While this is fine dandy and working it's a magic value mess which -probably only the author understands. -Needs a cleanup and more documentation */ - -/* -Table for pure ascii output escaping all characters above 127 to \uXXXX */ -static const JSUINT8 g_asciiOutputTable[256] = { - /* 0x00 */ 0, - 30, - 30, - 30, - 30, - 30, - 30, - 30, - 10, - 12, - 14, - 30, - 16, - 18, - 30, - 30, - /* 0x10 */ 30, - 30, - 30, - 30, - 30, - 30, - 30, - 30, - 30, - 30, - 30, - 30, - 30, - 30, - 30, - 30, - /* 0x20 */ 1, - 1, - 20, - 1, - 1, - 1, - 29, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 24, - /* 0x30 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 29, - 1, - 29, - 1, - /* 0x40 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0x50 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 22, - 1, - 1, - 1, - /* 0x60 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0x70 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0x80 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0x90 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0xa0 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0xb0 */ 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - /* 0xc0 */ 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - /* 0xd0 */ 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - /* 0xe0 */ 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - /* 0xf0 */ 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 5, - 5, - 5, - 5, - 6, - 6, - 1, - 1}; - -static void SetError(JSOBJ obj, JSONObjectEncoder *enc, const char *message) { - enc->errorMsg = message; - enc->errorObj = obj; -} - -/* -FIXME: Keep track of how big these get across several encoder calls and try to -make an estimate -That way we won't run our head into the wall each call */ -void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded) { - size_t curSize = enc->end - enc->start; - size_t newSize = curSize * 2; - size_t offset = enc->offset - enc->start; - - while (newSize < curSize + cbNeeded) { - newSize *= 2; - } - - if (enc->heap) { - enc->start = (char *)enc->realloc(enc->start, newSize); - if (!enc->start) { - SetError(NULL, enc, "Could not reserve memory block"); - return; - } - } else { - char *oldStart = enc->start; - enc->heap = 1; - enc->start = (char *)enc->malloc(newSize); - if (!enc->start) { - SetError(NULL, enc, "Could not reserve memory block"); - return; - } - memcpy(enc->start, oldStart, offset); - } - enc->offset = enc->start + offset; - enc->end = enc->start + newSize; -} - -INLINE_PREFIX void FASTCALL_MSVC -Buffer_AppendShortHexUnchecked(char *outputOffset, unsigned short value) { - *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12]; - *(outputOffset++) = g_hexChars[(value & 
0x0f00) >> 8]; - *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4]; - *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0]; -} - -int Buffer_EscapeStringUnvalidated(JSONObjectEncoder *enc, const char *io, - const char *end) { - char *of = (char *)enc->offset; - - for (;;) { - switch (*io) { - case 0x00: { - if (io < end) { - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - break; - } else { - enc->offset += (of - enc->offset); - return TRUE; - } - } - case '\"': - (*of++) = '\\'; - (*of++) = '\"'; - break; - case '\\': - (*of++) = '\\'; - (*of++) = '\\'; - break; - case '/': - (*of++) = '\\'; - (*of++) = '/'; - break; - case '\b': - (*of++) = '\\'; - (*of++) = 'b'; - break; - case '\f': - (*of++) = '\\'; - (*of++) = 'f'; - break; - case '\n': - (*of++) = '\\'; - (*of++) = 'n'; - break; - case '\r': - (*of++) = '\\'; - (*of++) = 'r'; - break; - case '\t': - (*of++) = '\\'; - (*of++) = 't'; - break; - - case 0x26: // '/' - case 0x3c: // '<' - case 0x3e: // '>' - { - if (enc->encodeHTMLChars) { - // Fall through to \u00XX case below. - PD_FALLTHROUGH; - } else { - // Same as default case below. - (*of++) = (*io); - break; - } - } - case 0x01: - case 0x02: - case 0x03: - case 0x04: - case 0x05: - case 0x06: - case 0x07: - case 0x0b: - case 0x0e: - case 0x0f: - case 0x10: - case 0x11: - case 0x12: - case 0x13: - case 0x14: - case 0x15: - case 0x16: - case 0x17: - case 0x18: - case 0x19: - case 0x1a: - case 0x1b: - case 0x1c: - case 0x1d: - case 0x1e: - case 0x1f: { - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)]; - *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)]; - break; - } - default: - (*of++) = (*io); - break; - } - io++; - } -} - -int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc, - const char *io, const char *end) { - JSUTF32 ucs; - char *of = (char *)enc->offset; - - for (;;) { - JSUINT8 utflen = g_asciiOutputTable[(unsigned char)*io]; - - switch (utflen) { - case 0: { - if (io < end) { - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - io++; - continue; - } else { - enc->offset += (of - enc->offset); - return TRUE; - } - } - - case 1: { - *(of++) = (*io++); - continue; - } - - case 2: { - JSUTF32 in; - JSUTF16 in16; - - if (end - io < 1) { - enc->offset += (of - enc->offset); - SetError(obj, enc, "Unterminated UTF-8 sequence when encoding string"); - return FALSE; - } - - memcpy(&in16, io, sizeof(JSUTF16)); - in = (JSUTF32)in16; - -#ifdef __LITTLE_ENDIAN__ - ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f); -#else - ucs = ((in & 0x1f00) >> 2) | (in & 0x3f); -#endif - - if (ucs < 0x80) { - enc->offset += (of - enc->offset); - SetError(obj, enc, - "Overlong 2 byte UTF-8 sequence detected when " - "encoding string"); - return FALSE; - } - - io += 2; - break; - } - - case 3: { - JSUTF32 in; - JSUTF16 in16; - JSUINT8 in8; - - if (end - io < 2) { - enc->offset += (of - enc->offset); - SetError(obj, enc, "Unterminated UTF-8 sequence when encoding string"); - return FALSE; - } - - memcpy(&in16, io, sizeof(JSUTF16)); - memcpy(&in8, io + 2, sizeof(JSUINT8)); -#ifdef __LITTLE_ENDIAN__ - in = (JSUTF32)in16; - in |= in8 << 16; - ucs = - ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) | ((in & 0x3f0000) >> 16); -#else - in = in16 << 8; - in |= in8; - ucs = ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); -#endif - - if (ucs < 0x800) { - enc->offset += (of - enc->offset); - 
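        // Code points below U+0800 fit in two bytes or fewer, so a 3-byte
        // sequence that decodes to one is an overlong (invalid) encoding.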
SetError(obj, enc, - "Overlong 3 byte UTF-8 sequence detected when " - "encoding string"); - return FALSE; - } - - io += 3; - break; - } - case 4: { - JSUTF32 in; - - if (end - io < 3) { - enc->offset += (of - enc->offset); - SetError(obj, enc, "Unterminated UTF-8 sequence when encoding string"); - return FALSE; - } - - memcpy(&in, io, sizeof(JSUTF32)); -#ifdef __LITTLE_ENDIAN__ - ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) | - ((in & 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24); -#else - ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) | - ((in & 0x3f00) >> 2) | (in & 0x3f); -#endif - if (ucs < 0x10000) { - enc->offset += (of - enc->offset); - SetError(obj, enc, - "Overlong 4 byte UTF-8 sequence detected when " - "encoding string"); - return FALSE; - } - - io += 4; - break; - } - - case 5: - case 6: { - enc->offset += (of - enc->offset); - SetError(obj, enc, - "Unsupported UTF-8 sequence length when encoding string"); - return FALSE; - } - - case 29: { - if (enc->encodeHTMLChars) { - // Fall through to \u00XX case 30 below. - PD_FALLTHROUGH; - } else { - // Same as case 1 above. - *(of++) = (*io++); - continue; - } - } - - case 30: { - // \uXXXX encode - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)]; - *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)]; - io++; - continue; - } - case 10: - case 12: - case 14: - case 16: - case 18: - case 20: - case 22: - case 24: { - *(of++) = *((char *)(g_escapeChars + utflen + 0)); - *(of++) = *((char *)(g_escapeChars + utflen + 1)); - io++; - continue; - } - // This can never happen, it's here to make L4 VC++ happy - default: { - ucs = 0; - break; - } - } - - /* - If the character is a UTF8 sequence of length > 1 we end up here */ - if (ucs >= 0x10000) { - ucs -= 0x10000; - *(of++) = '\\'; - *(of++) = 'u'; - Buffer_AppendShortHexUnchecked(of, (unsigned short)(ucs >> 10) + 0xd800); - of += 4; - - *(of++) = '\\'; - *(of++) = 'u'; - Buffer_AppendShortHexUnchecked(of, - (unsigned short)(ucs & 0x3ff) + 0xdc00); - of += 4; - } else { - *(of++) = '\\'; - *(of++) = 'u'; - Buffer_AppendShortHexUnchecked(of, (unsigned short)ucs); - of += 4; - } - } -} - -#define Buffer_Reserve(__enc, __len) \ - if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \ - Buffer_Realloc((__enc), (__len)); \ - } - -#define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr; - -INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, char *end) { - char aux; - while (end > begin) - aux = *end, *end-- = *begin, *begin++ = aux; -} - -void Buffer_AppendIndentNewlineUnchecked(JSONObjectEncoder *enc) { - if (enc->indent > 0) - Buffer_AppendCharUnchecked(enc, '\n'); -} - -// This function could be refactored to only accept enc as an argument, -// but this is a straight vendor from ujson source -void Buffer_AppendIndentUnchecked(JSONObjectEncoder *enc, JSINT32 value) { - int i; - if (enc->indent > 0) { - while (value-- > 0) - for (i = 0; i < enc->indent; i++) - Buffer_AppendCharUnchecked(enc, ' '); - } -} - -void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) { - char *wstr; - JSUINT32 uvalue = (value < 0) ? -value : value; - wstr = enc->offset; - - // Conversion. Number is reversed. 
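  // Digits are emitted least-significant first (e.g. 1234 comes out as 4321);
  // strreverse() below flips them into place.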
- do { - *wstr++ = (char)(48 + (uvalue % 10)); - } while (uvalue /= 10); - if (value < 0) - *wstr++ = '-'; - - // Reverse string - strreverse(enc->offset, wstr - 1); - enc->offset += (wstr - (enc->offset)); -} - -void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) { - char *wstr; - JSUINT64 uvalue; - if (value == INT64_MIN) { - uvalue = INT64_MAX + UINT64_C(1); - } else { - uvalue = (value < 0) ? -value : value; - } - - wstr = enc->offset; - // Conversion. Number is reversed. - - do { - *wstr++ = (char)(48 + (uvalue % 10ULL)); - } while (uvalue /= 10ULL); - if (value < 0) - *wstr++ = '-'; - - // Reverse string - strreverse(enc->offset, wstr - 1); - enc->offset += (wstr - (enc->offset)); -} - -int Buffer_AppendDoubleUnchecked(JSOBJ obj, JSONObjectEncoder *enc, - double value) { - /* if input is beyond the thresholds, revert to exponential */ - const double thres_max = (double)1e16 - 1; - const double thres_min = (double)1e-15; - char precision_str[20]; - int count; - double diff = 0.0; - char *str = enc->offset; - char *wstr = str; - unsigned long long whole; - double tmp; - unsigned long long frac; - int neg; - double pow10; - - if (value == HUGE_VAL || value == -HUGE_VAL) { - SetError(obj, enc, "Invalid Inf value when encoding double"); - return FALSE; - } - - if (!(value == value)) { - SetError(obj, enc, "Invalid Nan value when encoding double"); - return FALSE; - } - - /* we'll work in positive values and deal with the - negative sign issue later */ - neg = 0; - if (value < 0) { - neg = 1; - value = -value; - } - - /* - for very large or small numbers switch back to native sprintf for - exponentials. anyone want to write code to replace this? */ - if (value > thres_max || (value != 0.0 && fabs(value) < thres_min)) { - precision_str[0] = '%'; - precision_str[1] = '.'; -#if defined(_WIN32) && defined(_MSC_VER) - sprintf_s(precision_str + 2, sizeof(precision_str) - 2, "%ug", - enc->doublePrecision); - enc->offset += sprintf_s(str, enc->end - enc->offset, precision_str, - neg ? -value : value); -#else - snprintf(precision_str + 2, sizeof(precision_str) - 2, "%ug", - enc->doublePrecision); - enc->offset += snprintf(str, enc->end - enc->offset, precision_str, - neg ? -value : value); -#endif - return TRUE; - } - - pow10 = g_pow10[enc->doublePrecision]; - - whole = (unsigned long long)value; - tmp = (value - whole) * pow10; - frac = (unsigned long long)(tmp); - diff = tmp - frac; - - if (diff > 0.5) { - ++frac; - } else if (diff == 0.5 && ((frac == 0) || (frac & 1))) { - /* if halfway, round up if odd, OR - if last digit is 0. That last part is strange */ - ++frac; - } - - // handle rollover, e.g. - // case 0.99 with prec 1 is 1.0 and case 0.95 with prec is 1.0 as well - if (frac >= pow10) { - frac = 0; - ++whole; - } - - if (enc->doublePrecision == 0) { - diff = value - whole; - - if (diff > 0.5) { - /* greater than 0.5, round up, e.g. 
1.6 -> 2 */ - ++whole; - } else if (diff == 0.5 && (whole & 1)) { - /* exactly 0.5 and ODD, then round up */ - /* 1.5 -> 2, but 2.5 -> 2 */ - ++whole; - } - - // vvvvvvvvvvvvvvvvvvv Diff from modp_dto2 - } else if (frac) { - count = enc->doublePrecision; - // now do fractional part, as an unsigned number - // we know it is not 0 but we can have leading zeros, these - // should be removed - while (!(frac % 10)) { - --count; - frac /= 10; - } - //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2 - - // now do fractional part, as an unsigned number - do { - --count; - *wstr++ = (char)(48 + (frac % 10)); - } while (frac /= 10); - // add extra 0s - while (count-- > 0) { - *wstr++ = '0'; - } - // add decimal - *wstr++ = '.'; - } else { - *wstr++ = '0'; - *wstr++ = '.'; - } - - // Do whole part. Take care of sign - // conversion. Number is reversed. - do { - *wstr++ = (char)(48 + (whole % 10)); - } while (whole /= 10); - - if (neg) { - *wstr++ = '-'; - } - strreverse(str, wstr - 1); - enc->offset += (wstr - (enc->offset)); - - return TRUE; -} - -/* -FIXME: -Handle integration functions returning NULL here */ - -/* -FIXME: -Perhaps implement recursion detection */ - -void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, - size_t cbName) { - const char *value; - const char *objName; - int count; - JSOBJ iterObj; - size_t szlen; - JSONTypeContext tc; - tc.encoder = enc; - - if (enc->level > enc->recursionMax) { - SetError(obj, enc, "Maximum recursion level reached"); - return; - } - - /* - This reservation must hold - - length of _name as encoded worst case + - maxLength of double to string OR maxLength of JSLONG to string - */ - - Buffer_Reserve(enc, 256 + RESERVE_STRING(cbName)); - if (enc->errorMsg) { - return; - } - - if (name) { - Buffer_AppendCharUnchecked(enc, '\"'); - - if (enc->forceASCII) { - if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) { - return; - } - } else { - if (!Buffer_EscapeStringUnvalidated(enc, name, name + cbName)) { - return; - } - } - - Buffer_AppendCharUnchecked(enc, '\"'); - - Buffer_AppendCharUnchecked(enc, ':'); -#ifndef JSON_NO_EXTRA_WHITESPACE - Buffer_AppendCharUnchecked(enc, ' '); -#endif - } - - enc->beginTypeContext(obj, &tc); - - switch (tc.type) { - case JT_INVALID: { - return; - } - - case JT_ARRAY: { - count = 0; - enc->iterBegin(obj, &tc); - - Buffer_AppendCharUnchecked(enc, '['); - Buffer_AppendIndentNewlineUnchecked(enc); - - while (enc->iterNext(obj, &tc)) { - if (count > 0) { - Buffer_AppendCharUnchecked(enc, ','); -#ifndef JSON_NO_EXTRA_WHITESPACE - Buffer_AppendCharUnchecked(buffer, ' '); -#endif - Buffer_AppendIndentNewlineUnchecked(enc); - } - - iterObj = enc->iterGetValue(obj, &tc); - - enc->level++; - Buffer_AppendIndentUnchecked(enc, enc->level); - encode(iterObj, enc, NULL, 0); - count++; - } - - enc->iterEnd(obj, &tc); - Buffer_AppendIndentNewlineUnchecked(enc); - Buffer_AppendIndentUnchecked(enc, enc->level); - Buffer_AppendCharUnchecked(enc, ']'); - break; - } - - case JT_OBJECT: { - count = 0; - enc->iterBegin(obj, &tc); - - Buffer_AppendCharUnchecked(enc, '{'); - Buffer_AppendIndentNewlineUnchecked(enc); - - while (enc->iterNext(obj, &tc)) { - if (count > 0) { - Buffer_AppendCharUnchecked(enc, ','); -#ifndef JSON_NO_EXTRA_WHITESPACE - Buffer_AppendCharUnchecked(enc, ' '); -#endif - Buffer_AppendIndentNewlineUnchecked(enc); - } - - iterObj = enc->iterGetValue(obj, &tc); - objName = enc->iterGetName(obj, &tc, &szlen); - - enc->level++; - Buffer_AppendIndentUnchecked(enc, enc->level); - encode(iterObj, enc, objName, szlen); - 
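      // Each value is emitted by recursing into encode(); the enc->level
      // increment above is what the recursionMax check at the top of
      // encode() guards against.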
count++; - } - - enc->iterEnd(obj, &tc); - Buffer_AppendIndentNewlineUnchecked(enc); - Buffer_AppendIndentUnchecked(enc, enc->level); - Buffer_AppendCharUnchecked(enc, '}'); - break; - } - - case JT_LONG: { - Buffer_AppendLongUnchecked(enc, enc->getLongValue(obj, &tc)); - break; - } - - case JT_INT: { - Buffer_AppendIntUnchecked(enc, enc->getIntValue(obj, &tc)); - break; - } - - case JT_TRUE: { - Buffer_AppendCharUnchecked(enc, 't'); - Buffer_AppendCharUnchecked(enc, 'r'); - Buffer_AppendCharUnchecked(enc, 'u'); - Buffer_AppendCharUnchecked(enc, 'e'); - break; - } - - case JT_FALSE: { - Buffer_AppendCharUnchecked(enc, 'f'); - Buffer_AppendCharUnchecked(enc, 'a'); - Buffer_AppendCharUnchecked(enc, 'l'); - Buffer_AppendCharUnchecked(enc, 's'); - Buffer_AppendCharUnchecked(enc, 'e'); - break; - } - - case JT_NULL: { - Buffer_AppendCharUnchecked(enc, 'n'); - Buffer_AppendCharUnchecked(enc, 'u'); - Buffer_AppendCharUnchecked(enc, 'l'); - Buffer_AppendCharUnchecked(enc, 'l'); - break; - } - - case JT_DOUBLE: { - if (!Buffer_AppendDoubleUnchecked(obj, enc, - enc->getDoubleValue(obj, &tc))) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - break; - } - - case JT_UTF8: { - value = enc->getStringValue(obj, &tc, &szlen); - if (enc->errorMsg) { - enc->endTypeContext(obj, &tc); - return; - } - Buffer_Reserve(enc, RESERVE_STRING(szlen)); - Buffer_AppendCharUnchecked(enc, '\"'); - - if (enc->forceASCII) { - if (!Buffer_EscapeStringValidated(obj, enc, value, value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - } else { - if (!Buffer_EscapeStringUnvalidated(enc, value, value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - } - - Buffer_AppendCharUnchecked(enc, '\"'); - break; - } - - case JT_BIGNUM: { - value = enc->getBigNumStringValue(obj, &tc, &szlen); - - Buffer_Reserve(enc, RESERVE_STRING(szlen)); - if (enc->errorMsg) { - enc->endTypeContext(obj, &tc); - return; - } - - if (enc->forceASCII) { - if (!Buffer_EscapeStringValidated(obj, enc, value, value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - } else { - if (!Buffer_EscapeStringUnvalidated(enc, value, value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - } - - break; - } - } - - enc->endTypeContext(obj, &tc); - enc->level--; -} - -char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, - size_t _cbBuffer) { - char *locale; - enc->malloc = enc->malloc ? enc->malloc : malloc; - enc->free = enc->free ? enc->free : free; - enc->realloc = enc->realloc ? 
enc->realloc : realloc; - enc->errorMsg = NULL; - enc->errorObj = NULL; - enc->level = 0; - - if (enc->recursionMax < 1) { - enc->recursionMax = JSON_MAX_RECURSION_DEPTH; - } - - if (enc->doublePrecision < 0 || - enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) { - enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS; - } - - if (_buffer == NULL) { - _cbBuffer = 32768; - enc->start = (char *)enc->malloc(_cbBuffer); - if (!enc->start) { - SetError(obj, enc, "Could not reserve memory block"); - return NULL; - } - enc->heap = 1; - } else { - enc->start = _buffer; - enc->heap = 0; - } - - enc->end = enc->start + _cbBuffer; - enc->offset = enc->start; - - locale = setlocale(LC_NUMERIC, NULL); - if (!locale) { - SetError(NULL, enc, "setlocale call failed"); - return NULL; - } - - if (strcmp(locale, "C")) { - size_t len = strlen(locale) + 1; - char *saved_locale = malloc(len); - if (saved_locale == NULL) { - SetError(NULL, enc, "Could not reserve memory block"); - return NULL; - } - memcpy(saved_locale, locale, len); - setlocale(LC_NUMERIC, "C"); - encode(obj, enc, NULL, 0); - setlocale(LC_NUMERIC, saved_locale); - free(saved_locale); - } else { - encode(obj, enc, NULL, 0); - } - - Buffer_Reserve(enc, 1); - if (enc->errorMsg) { - return NULL; - } - Buffer_AppendCharUnchecked(enc, '\0'); - - return enc->start; -} diff --git a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c deleted file mode 100644 index ef6f1104a1fb9..0000000000000 --- a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c +++ /dev/null @@ -1,171 +0,0 @@ -/* -Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the ESN Social Software AB nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE -GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF -THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights -reserved. - -Numeric decoder derived from TCL library -https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms - * Copyright (c) 1988-1993 The Regents of the University of California. 
- * Copyright (c) 1994 Sun Microsystems, Inc. -*/ - -// Licence at LICENSES/ULTRAJSON_LICENSE - -#define PY_SSIZE_T_CLEAN -#include - -#include "pandas/vendored/ujson/lib/ultrajson.h" - -static int Object_objectAddKey(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ name, - JSOBJ value) { - int ret = PyDict_SetItem(obj, name, value); - Py_DECREF((PyObject *)name); - Py_DECREF((PyObject *)value); - return ret == 0 ? 1 : 0; -} - -static int Object_arrayAddItem(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ value) { - int ret = PyList_Append(obj, value); - Py_DECREF((PyObject *)value); - return ret == 0 ? 1 : 0; -} - -static JSOBJ Object_newString(void *Py_UNUSED(prv), wchar_t *start, - wchar_t *end) { - return PyUnicode_FromWideChar(start, (end - start)); -} - -static JSOBJ Object_newTrue(void *Py_UNUSED(prv)) { Py_RETURN_TRUE; } - -static JSOBJ Object_newFalse(void *Py_UNUSED(prv)) { Py_RETURN_FALSE; } - -static JSOBJ Object_newNull(void *Py_UNUSED(prv)) { Py_RETURN_NONE; } - -static JSOBJ Object_newPosInf(void *Py_UNUSED(prv)) { - return PyFloat_FromDouble(Py_HUGE_VAL); -} - -static JSOBJ Object_newNegInf(void *Py_UNUSED(prv)) { - return PyFloat_FromDouble(-Py_HUGE_VAL); -} - -static JSOBJ Object_newObject(void *Py_UNUSED(prv), void *Py_UNUSED(decoder)) { - return PyDict_New(); -} - -static JSOBJ Object_endObject(void *Py_UNUSED(prv), JSOBJ obj) { return obj; } - -static JSOBJ Object_newArray(void *Py_UNUSED(prv), void *Py_UNUSED(decoder)) { - return PyList_New(0); -} - -static JSOBJ Object_endArray(void *Py_UNUSED(prv), JSOBJ obj) { return obj; } - -static JSOBJ Object_newInteger(void *Py_UNUSED(prv), JSINT32 value) { - return PyLong_FromLong(value); -} - -static JSOBJ Object_newLong(void *Py_UNUSED(prv), JSINT64 value) { - return PyLong_FromLongLong(value); -} - -static JSOBJ Object_newUnsignedLong(void *Py_UNUSED(prv), JSUINT64 value) { - return PyLong_FromUnsignedLongLong(value); -} - -static JSOBJ Object_newDouble(void *Py_UNUSED(prv), double value) { - return PyFloat_FromDouble(value); -} - -static void Object_releaseObject(void *Py_UNUSED(prv), JSOBJ obj, - void *Py_UNUSED(decoder)) { - Py_XDECREF(((PyObject *)obj)); -} - -PyObject *JSONToObj(PyObject *Py_UNUSED(self), PyObject *args, - PyObject *kwargs) { - JSONObjectDecoder dec = {.newString = Object_newString, - .objectAddKey = Object_objectAddKey, - .arrayAddItem = Object_arrayAddItem, - .newTrue = Object_newTrue, - .newFalse = Object_newFalse, - .newNull = Object_newNull, - .newPosInf = Object_newPosInf, - .newNegInf = Object_newNegInf, - .newObject = Object_newObject, - .endObject = Object_endObject, - .newArray = Object_newArray, - .endArray = Object_endArray, - .newInt = Object_newInteger, - .newLong = Object_newLong, - .newUnsignedLong = Object_newUnsignedLong, - .newDouble = Object_newDouble, - .releaseObject = Object_releaseObject, - .malloc = PyObject_Malloc, - .free = PyObject_Free, - .realloc = PyObject_Realloc, - .errorStr = NULL, - .errorOffset = NULL, - .preciseFloat = 0, - .prv = NULL}; - - char *kwlist[] = {"obj", "precise_float", NULL}; - char *buf; - Py_ssize_t len; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|b", kwlist, &buf, &len, - &dec.preciseFloat)) { - return NULL; - } - - PyObject *ret = JSON_DecodeObject(&dec, buf, len); - - if (PyErr_Occurred()) { - if (ret) { - Py_DECREF((PyObject *)ret); - } - return NULL; - } - - if (dec.errorStr) { - /* - FIXME: It's possible to give a much nicer error message here with actual - failing element in input etc*/ - - PyErr_Format(PyExc_ValueError, "%s", dec.errorStr); - - if 
(ret) { - Py_DECREF((PyObject *)ret); - } - - return NULL; - } - - return ret; -} diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c deleted file mode 100644 index 8342dbcd1763d..0000000000000 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ /dev/null @@ -1,2100 +0,0 @@ -/* -Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: -* Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -* Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. -* Neither the name of the ESN Social Software AB nor the -names of its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE -GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF -THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights -reserved. - -Numeric decoder derived from TCL library -https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms -* Copyright (c) 1988-1993 The Regents of the University of California. -* Copyright (c) 1994 Sun Microsystems, Inc. 
-*/ - -// Licence at LICENSES/ULTRAJSON_LICENSE - -#define PY_SSIZE_T_CLEAN -#include - -#define NO_IMPORT_ARRAY -#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY -#include "datetime.h" -#include "pandas/datetime/pd_datetime.h" -#include "pandas/vendored/ujson/lib/ultrajson.h" -#include -#include -#include -#include - -npy_int64 get_nat(void) { return NPY_MIN_INT64; } - -typedef const char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti, - size_t *_outLen); - -int object_is_decimal_type(PyObject *obj); -int object_is_dataframe_type(PyObject *obj); -int object_is_series_type(PyObject *obj); -int object_is_index_type(PyObject *obj); -int object_is_nat_type(PyObject *obj); -int object_is_na_type(PyObject *obj); - -typedef struct __NpyArrContext { - PyObject *array; - char *dataptr; - npy_intp curdim; // current dimension in array's order - npy_intp stridedim; // dimension we are striding over - int inc; // stride dimension increment (+/- 1) - npy_intp dim; - npy_intp stride; - npy_intp ndim; - npy_intp index[NPY_MAXDIMS]; - int type_num; - - char **rowLabels; - char **columnLabels; -} NpyArrContext; - -typedef struct __PdBlockContext { - Py_ssize_t colIdx; - Py_ssize_t ncols; - int transpose; - - NpyArrContext **npyCtxts; // NpyArrContext for each column -} PdBlockContext; - -typedef struct __TypeContext { - JSPFN_ITERBEGIN iterBegin; - JSPFN_ITEREND iterEnd; - JSPFN_ITERNEXT iterNext; - JSPFN_ITERGETNAME iterGetName; - JSPFN_ITERGETVALUE iterGetValue; - PFN_PyTypeToUTF8 PyTypeToUTF8; - PyObject *newObj; - PyObject *dictObj; - Py_ssize_t index; - Py_ssize_t size; - PyObject *itemValue; - PyObject *itemName; - PyObject *attrList; - PyObject *iterator; - - double doubleValue; - JSINT64 longValue; - - const char *cStr; - NpyArrContext *npyarr; - PdBlockContext *pdblock; - int transpose; - char **rowLabels; - char **columnLabels; - npy_intp rowLabelsLen; - npy_intp columnLabelsLen; -} TypeContext; - -typedef struct __PyObjectEncoder { - JSONObjectEncoder enc; - - // pass through the NpyArrContext when encoding multi-dimensional arrays - NpyArrContext *npyCtxtPassthru; - - // pass through the PdBlockContext when encoding blocks - PdBlockContext *blkCtxtPassthru; - - // pass-through to encode numpy data directly - int npyType; - void *npyValue; - - int datetimeIso; - NPY_DATETIMEUNIT datetimeUnit; - NPY_DATETIMEUNIT valueUnit; - - // output format style for pandas data types - int outputFormat; - int originalOutputFormat; - - PyObject *defaultHandler; -} PyObjectEncoder; - -#define GET_TC(__ptrtc) ((TypeContext *)((__ptrtc)->prv)) - -enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; - -static int PdBlock_iterNext(JSOBJ, JSONTypeContext *); - -static TypeContext *createTypeContext(void) { - TypeContext *pc = PyObject_Malloc(sizeof(TypeContext)); - if (!pc) { - PyErr_NoMemory(); - return NULL; - } - pc->newObj = NULL; - pc->dictObj = NULL; - pc->itemValue = NULL; - pc->itemName = NULL; - pc->attrList = NULL; - pc->index = 0; - pc->size = 0; - pc->longValue = 0; - pc->doubleValue = 0.0; - pc->cStr = NULL; - pc->npyarr = NULL; - pc->pdblock = NULL; - pc->rowLabels = NULL; - pc->columnLabels = NULL; - pc->transpose = 0; - pc->rowLabelsLen = 0; - pc->columnLabelsLen = 0; - - return pc; -} - -static PyObject *get_values(PyObject *obj) { - PyObject *values = NULL; - - if (object_is_index_type(obj) || object_is_series_type(obj)) { - // The special cases to worry about are dt64tz and category[dt64tz]. 
- // In both cases we want the UTC-localized datetime64 ndarray, - // without going through and object array of Timestamps. - if (PyObject_HasAttrString(obj, "tz")) { - PyObject *tz = PyObject_GetAttrString(obj, "tz"); - if (tz != Py_None) { - // Go through object array if we have dt64tz, since tz info will - // be lost if values is used directly. - Py_DECREF(tz); - values = PyObject_CallMethod(obj, "__array__", NULL); - return values; - } - Py_DECREF(tz); - } - values = PyObject_GetAttrString(obj, "values"); - if (values == NULL) { - // Clear so we can subsequently try another method - PyErr_Clear(); - } else if (PyObject_HasAttrString(values, "__array__")) { - // We may have gotten a Categorical or Sparse array so call np.array - PyObject *array_values = PyObject_CallMethod(values, "__array__", NULL); - Py_DECREF(values); - values = array_values; - } else if (!PyArray_CheckExact(values)) { - // Didn't get a numpy array, so keep trying - Py_DECREF(values); - values = NULL; - } - } - - if (values == NULL) { - PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj)); - PyObject *repr; - if (PyObject_HasAttrString(obj, "dtype")) { - PyObject *dtype = PyObject_GetAttrString(obj, "dtype"); - repr = PyObject_Repr(dtype); - Py_DECREF(dtype); - } else { - repr = PyUnicode_FromString(""); - } - - PyErr_Format(PyExc_ValueError, "%R or %R are not JSON serializable yet", - repr, typeRepr); - Py_DECREF(repr); - Py_DECREF(typeRepr); - - return NULL; - } - - return values; -} - -static PyObject *get_sub_attr(PyObject *obj, char *attr, char *subAttr) { - PyObject *tmp = PyObject_GetAttrString(obj, attr); - if (tmp == 0) { - return 0; - } - PyObject *ret = PyObject_GetAttrString(tmp, subAttr); - Py_DECREF(tmp); - - return ret; -} - -static Py_ssize_t get_attr_length(PyObject *obj, char *attr) { - PyObject *tmp = PyObject_GetAttrString(obj, attr); - if (tmp == 0) { - return 0; - } - Py_ssize_t ret = PyObject_Length(tmp); - Py_DECREF(tmp); - - if (ret == -1) { - return 0; - } - - return ret; -} - -static npy_int64 get_long_attr(PyObject *o, const char *attr) { - // NB we are implicitly assuming that o is a Timedelta or Timestamp, or NaT - - PyObject *value = PyObject_GetAttrString(o, attr); - const npy_int64 long_val = - (PyLong_Check(value) ? PyLong_AsLongLong(value) : PyLong_AsLong(value)); - - Py_DECREF(value); - - if (object_is_nat_type(o)) { - // i.e. 
o is NaT, long_val will be NPY_MIN_INT64 - return long_val; - } - - // ensure we are in nanoseconds, similar to Timestamp._as_creso or _as_unit - PyObject *reso = PyObject_GetAttrString(o, "_creso"); - if (!PyLong_Check(reso)) { - // https://github.com/pandas-dev/pandas/pull/49034#discussion_r1023165139 - Py_DECREF(reso); - return -1; - } - - long cReso = PyLong_AsLong(reso); - Py_DECREF(reso); - if (cReso == -1 && PyErr_Occurred()) { - return -1; - } - - if (cReso == NPY_FR_us) { - return long_val * 1000L; - } else if (cReso == NPY_FR_ms) { - return long_val * 1000000L; - } else if (cReso == NPY_FR_s) { - return long_val * 1000000000L; - } - - return long_val; -} - -static npy_float64 total_seconds(PyObject *td) { - PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL); - const npy_float64 double_val = PyFloat_AS_DOUBLE(value); - Py_DECREF(value); - return double_val; -} - -static const char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), - size_t *_outLen) { - PyObject *obj = (PyObject *)_obj; - *_outLen = PyBytes_GET_SIZE(obj); - return PyBytes_AS_STRING(obj); -} - -static const char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, - size_t *_outLen) { - char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen); - if (encoded == NULL) { - /* Something went wrong. - Set errorMsg(to tell encoder to stop), - and let Python exception propagate. */ - JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder; - enc->errorMsg = "Encoding failed."; - } - return encoded; -} - -/* JSON callback. returns a char* and mutates the pointer to *len */ -static const char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), - JSONTypeContext *tc, size_t *len) { - NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit; - GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len); - return GET_TC(tc)->cStr; -} - -/* JSON callback. 
returns a char* and mutates the pointer to *len */ -static const char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), - JSONTypeContext *tc, size_t *len) { - GET_TC(tc)->cStr = int64ToIsoDuration(GET_TC(tc)->longValue, len); - return GET_TC(tc)->cStr; -} - -/* JSON callback */ -static const char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, - size_t *len) { - if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) { - PyErr_SetString(PyExc_TypeError, "Expected date or datetime object"); - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - return NULL; - } - - NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - return PyDateTimeToIso(obj, base, len); -} - -static const char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, - size_t *outLen) { - PyObject *obj = (PyObject *)_obj; - PyObject *str = PyObject_CallMethod(obj, "isoformat", NULL); - if (str == NULL) { - *outLen = 0; - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, "Failed to convert time"); - } - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - return NULL; - } - if (PyUnicode_Check(str)) { - PyObject *tmp = str; - str = PyUnicode_AsUTF8String(str); - Py_DECREF(tmp); - } - - GET_TC(tc)->newObj = str; - - *outLen = PyBytes_GET_SIZE(str); - char *outValue = PyBytes_AS_STRING(str); - return outValue; -} - -static const char *PyDecimalToUTF8Callback(JSOBJ _obj, JSONTypeContext *tc, - size_t *len) { - PyObject *obj = (PyObject *)_obj; - PyObject *format_spec = PyUnicode_FromStringAndSize("f", 1); - PyObject *str = PyObject_Format(obj, format_spec); - Py_DECREF(format_spec); - - if (str == NULL) { - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - return NULL; - } - - GET_TC(tc)->newObj = str; - - Py_ssize_t s_len; - char *outValue = (char *)PyUnicode_AsUTF8AndSize(str, &s_len); - *len = s_len; - - return outValue; -} - -//============================================================================= -// Numpy array iteration functions -//============================================================================= - -static void NpyArr_freeItemValue(JSOBJ Py_UNUSED(_obj), JSONTypeContext *tc) { - if (GET_TC(tc)->npyarr && - GET_TC(tc)->itemValue != GET_TC(tc)->npyarr->array) { - Py_XDECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } -} - -static int NpyArr_iterNextNone(JSOBJ Py_UNUSED(_obj), - JSONTypeContext *Py_UNUSED(tc)) { - return 0; -} - -static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { - PyArrayObject *obj = - (PyArrayObject *)(GET_TC(tc)->newObj ? 
GET_TC(tc)->newObj : _obj); - - NpyArrContext *npyarr = PyObject_Malloc(sizeof(NpyArrContext)); - GET_TC(tc)->npyarr = npyarr; - - if (!npyarr) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } - - npyarr->array = (PyObject *)obj; - npyarr->dataptr = PyArray_DATA(obj); - npyarr->ndim = PyArray_NDIM(obj) - 1; - npyarr->curdim = 0; - npyarr->type_num = PyArray_DESCR(obj)->type_num; - - if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, (int)npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, (int)npyarr->ndim); - npyarr->stridedim = npyarr->ndim; - npyarr->index[npyarr->ndim] = 0; - npyarr->inc = -1; - } else { - npyarr->dim = PyArray_DIM(obj, 0); - npyarr->stride = PyArray_STRIDE(obj, 0); - npyarr->stridedim = 0; - npyarr->index[0] = 0; - npyarr->inc = 1; - } - - npyarr->columnLabels = GET_TC(tc)->columnLabels; - npyarr->rowLabels = GET_TC(tc)->rowLabels; -} - -static void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; - - if (npyarr) { - NpyArr_freeItemValue(obj, tc); - PyObject_Free(npyarr); - } -} - -static void NpyArrPassThru_iterBegin(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc)) {} - -static void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; - // finished this dimension, reset the data pointer - npyarr->curdim--; - npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; - npyarr->stridedim -= npyarr->inc; - - if (!PyArray_Check(npyarr->array)) { - PyErr_SetString(PyExc_TypeError, - "NpyArrayPassThru_iterEnd received a non-array object"); - return; - } - const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; - npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim); - npyarr->dataptr += npyarr->stride; - - NpyArr_freeItemValue(obj, tc); -} - -static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; - - if (PyErr_Occurred()) { - return 0; - } - - if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { - return 0; - } - - NpyArr_freeItemValue(obj, tc); - - if (!PyArray_Check(npyarr->array)) { - PyErr_SetString(PyExc_TypeError, - "NpyArr_iterNextItem received a non-array object"); - return 0; - } - PyArrayObject *arrayobj = (PyArrayObject *)npyarr->array; - - if (PyArray_ISDATETIME(arrayobj)) { - GET_TC(tc)->itemValue = obj; - Py_INCREF(obj); - ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(arrayobj); - // Also write the resolution (unit) of the ndarray - PyArray_Descr *dtype = PyArray_DESCR(arrayobj); - ((PyObjectEncoder *)tc->encoder)->valueUnit = - get_datetime_metadata_from_dtype(dtype).base; - ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; - ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; - } else { - GET_TC(tc)->itemValue = PyArray_GETITEM(arrayobj, npyarr->dataptr); - } - - npyarr->dataptr += npyarr->stride; - npyarr->index[npyarr->stridedim]++; - return 1; -} - -static int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; - - if (PyErr_Occurred()) { - return 0; - } - - if (npyarr->curdim >= npyarr->ndim || - npyarr->index[npyarr->stridedim] >= npyarr->dim) { - // innermost dimension, start retrieving item values - GET_TC(tc)->iterNext = NpyArr_iterNextItem; - return NpyArr_iterNextItem(_obj, tc); - } - - // dig a dimension deeper - npyarr->index[npyarr->stridedim]++; - - npyarr->curdim++; 
- npyarr->stridedim += npyarr->inc; - if (!PyArray_Check(npyarr->array)) { - PyErr_SetString(PyExc_TypeError, - "NpyArr_iterNext received a non-array object"); - return 0; - } - const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; - - npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim); - npyarr->index[npyarr->stridedim] = 0; - - ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; - GET_TC(tc)->itemValue = npyarr->array; - return 1; -} - -static JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; -} - -static const char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; - const char *cStr; - - if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { - const npy_intp idx = npyarr->index[npyarr->stridedim] - 1; - cStr = npyarr->columnLabels[idx]; - } else { - const npy_intp idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; - cStr = npyarr->rowLabels[idx]; - } - - *outLen = strlen(cStr); - - return cStr; -} - -//============================================================================= -// Pandas block iteration functions -// -// Serialises a DataFrame column by column to avoid unnecessary data copies and -// more representative serialisation when dealing with mixed dtypes. -// -// Uses a dedicated NpyArrContext for each column. -//============================================================================= - -static void PdBlockPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - - if (blkCtxt->transpose) { - blkCtxt->colIdx++; - } else { - blkCtxt->colIdx = 0; - } - - NpyArr_freeItemValue(obj, tc); -} - -static int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - - if (blkCtxt->colIdx >= blkCtxt->ncols) { - return 0; - } - - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - blkCtxt->colIdx++; - return NpyArr_iterNextItem(obj, tc); -} - -static const char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *tc, size_t *outLen) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; - const char *cStr; - - if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { - const npy_intp idx = blkCtxt->colIdx - 1; - cStr = npyarr->columnLabels[idx]; - } else { - const npy_intp idx = - GET_TC(tc)->iterNext != PdBlock_iterNext - ? 
npyarr->index[npyarr->stridedim - npyarr->inc] - 1 - : npyarr->index[npyarr->stridedim]; - - cStr = npyarr->rowLabels[idx]; - } - - *outLen = strlen(cStr); - return cStr; -} - -static const char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), - JSONTypeContext *tc, - size_t *outLen) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - const char *cStr; - - if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { - const npy_intp idx = npyarr->index[npyarr->stridedim] - 1; - cStr = npyarr->columnLabels[idx]; - } else { - const npy_intp idx = blkCtxt->colIdx; - cStr = npyarr->rowLabels[idx]; - } - - *outLen = strlen(cStr); - return cStr; -} - -static int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - - if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { - return 0; - } - - if (blkCtxt->transpose) { - if (blkCtxt->colIdx >= blkCtxt->ncols) { - return 0; - } - } else { - const NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; - if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { - return 0; - } - } - - ((PyObjectEncoder *)tc->encoder)->blkCtxtPassthru = blkCtxt; - GET_TC(tc)->itemValue = obj; - - return 1; -} - -static void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), - JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - - if (blkCtxt->transpose) { - // if transposed we exhaust each column before moving to the next - GET_TC(tc)->iterNext = NpyArr_iterNextItem; - GET_TC(tc)->iterGetName = PdBlock_iterGetName_Transpose; - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - } -} - -static void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { - PyObject *obj = (PyObject *)_obj; - - GET_TC(tc)->iterGetName = GET_TC(tc)->transpose - ? PdBlock_iterGetName_Transpose - : PdBlock_iterGetName; - - PdBlockContext *blkCtxt = PyObject_Malloc(sizeof(PdBlockContext)); - if (!blkCtxt) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } - GET_TC(tc)->pdblock = blkCtxt; - - blkCtxt->colIdx = 0; - blkCtxt->transpose = GET_TC(tc)->transpose; - blkCtxt->ncols = get_attr_length(obj, "columns"); - - if (blkCtxt->ncols == 0) { - blkCtxt->npyCtxts = NULL; - - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } - - blkCtxt->npyCtxts = PyObject_Malloc(sizeof(NpyArrContext *) * blkCtxt->ncols); - if (!blkCtxt->npyCtxts) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } - - PyObject *arrays = get_sub_attr(obj, "_mgr", "column_arrays"); - if (!arrays) { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } - - for (Py_ssize_t i = 0; i < PyObject_Length(arrays); i++) { - PyObject *array = PyList_GET_ITEM(arrays, i); - if (!array) { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto ARR_RET; - } - - // ensure we have a numpy array (i.e. 
np.asarray) - PyObject *values = PyObject_CallMethod(array, "__array__", NULL); - if ((!values) || (!PyArray_CheckExact(values))) { - // Didn't get a numpy array - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto ARR_RET; - } - - GET_TC(tc)->newObj = values; - - // init a dedicated context for this column - NpyArr_iterBegin(obj, tc); - - GET_TC(tc)->itemValue = NULL; - ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL; - - blkCtxt->npyCtxts[i] = GET_TC(tc)->npyarr; - GET_TC(tc)->newObj = NULL; - } - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0]; - goto ARR_RET; - -ARR_RET: - Py_DECREF(arrays); -} - -static void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->itemValue = NULL; - NpyArrContext *npyarr = GET_TC(tc)->npyarr; - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - - if (blkCtxt) { - for (int i = 0; i < blkCtxt->ncols; i++) { - npyarr = blkCtxt->npyCtxts[i]; - if (npyarr) { - if (npyarr->array) { - Py_DECREF(npyarr->array); - npyarr->array = NULL; - } - - GET_TC(tc)->npyarr = npyarr; - NpyArr_iterEnd(obj, tc); - - blkCtxt->npyCtxts[i] = NULL; - } - } - - if (blkCtxt->npyCtxts) { - PyObject_Free(blkCtxt->npyCtxts); - } - PyObject_Free(blkCtxt); - } -} - -//============================================================================= -// Tuple iteration functions -// itemValue is borrowed reference, no ref counting -//============================================================================= -static void Tuple_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->index = 0; - GET_TC(tc)->size = PyTuple_GET_SIZE((PyObject *)obj); - GET_TC(tc)->itemValue = NULL; -} - -static int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) { - - if (GET_TC(tc)->index >= GET_TC(tc)->size) { - return 0; - } - - PyObject *item = PyTuple_GET_ITEM(obj, GET_TC(tc)->index); - - GET_TC(tc)->itemValue = item; - GET_TC(tc)->index++; - return 1; -} - -static void Tuple_iterEnd(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc)) {} - -static JSOBJ Tuple_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; -} - -static const char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { - return NULL; -} - -//============================================================================= -// Set iteration functions -// itemValue is borrowed reference, no ref counting -//============================================================================= -static void Set_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->itemValue = NULL; - GET_TC(tc)->iterator = PyObject_GetIter(obj); -} - -static int Set_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - if (GET_TC(tc)->itemValue) { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } - - PyObject *item = PyIter_Next(GET_TC(tc)->iterator); - - if (item == NULL) { - return 0; - } - - GET_TC(tc)->itemValue = item; - return 1; -} - -static void Set_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - if (GET_TC(tc)->itemValue) { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } - - if (GET_TC(tc)->iterator) { - Py_DECREF(GET_TC(tc)->iterator); - GET_TC(tc)->iterator = NULL; - } -} - -static JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; -} - -static const char *Set_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { - return NULL; -} - 
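/*
 * Editorial aside -- not part of the deleted objToJSON.c sources above or
 * below. The Tuple/Set helpers just shown (and the Dir/List/Dict/Index/
 * Series/DataFrame helpers that follow) all implement the same iteration
 * protocol that the ujson encoder drives through function pointers:
 * iterBegin once, iterNext repeatedly until it returns 0, iterGetValue for
 * each item, then iterEnd. The standalone sketch below illustrates that
 * calling convention over a plain C array. Every name in it (ToyContext,
 * toy_*) is hypothetical and exists only for illustration, under the
 * assumption that the real encoder invokes its callbacks in this order.
 */
#include <stdio.h>

typedef struct {
  const int *data;   /* borrowed pointer to the sequence being iterated */
  size_t size;       /* number of items */
  size_t index;      /* next position to visit */
  int current;       /* item most recently produced by toy_iterNext */
} ToyContext;

static void toy_iterBegin(ToyContext *tc, const int *data, size_t size) {
  tc->data = data;
  tc->size = size;
  tc->index = 0;
}

/* Returns 1 while another item is available, 0 once the sequence is done. */
static int toy_iterNext(ToyContext *tc) {
  if (tc->index >= tc->size) {
    return 0;
  }
  tc->current = tc->data[tc->index++];
  return 1;
}

static int toy_iterGetValue(const ToyContext *tc) { return tc->current; }

static void toy_iterEnd(ToyContext *tc) { tc->data = NULL; }

int main(void) {
  const int values[] = {1, 2, 3};
  ToyContext tc;

  /* The generic "encoder" loop: begin, next-until-exhausted, end. */
  toy_iterBegin(&tc, values, sizeof(values) / sizeof(values[0]));
  printf("[");
  for (int first = 1; toy_iterNext(&tc); first = 0) {
    printf(first ? "%d" : ", %d", toy_iterGetValue(&tc));
  }
  printf("]\n"); /* prints: [1, 2, 3] */
  toy_iterEnd(&tc);
  return 0;
}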
-//============================================================================= -// Dir iteration functions -// itemName ref is borrowed from PyObject_Dir (attrList). No refcount -// itemValue ref is from PyObject_GetAttr. Ref counted -//============================================================================= -static void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->attrList = PyObject_Dir(obj); - GET_TC(tc)->index = 0; - GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList); -} - -static void Dir_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - if (GET_TC(tc)->itemValue) { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } - - if (GET_TC(tc)->itemName) { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = NULL; - } - - Py_DECREF((PyObject *)GET_TC(tc)->attrList); -} - -static int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { - PyObject *obj = (PyObject *)_obj; - PyObject *itemValue = GET_TC(tc)->itemValue; - PyObject *itemName = GET_TC(tc)->itemName; - - if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { - return 0; - } - - if (itemValue) { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = itemValue = NULL; - } - - if (itemName) { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = itemName = NULL; - } - - for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index++) { - PyObject *attrName = - PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index); - PyObject *attr = PyUnicode_AsUTF8String(attrName); - const char *attrStr = PyBytes_AS_STRING(attr); - - if (attrStr[0] == '_') { - Py_DECREF(attr); - continue; - } - - itemValue = PyObject_GetAttr(obj, attrName); - if (itemValue == NULL) { - PyErr_Clear(); - Py_DECREF(attr); - continue; - } - - if (PyCallable_Check(itemValue)) { - Py_DECREF(itemValue); - Py_DECREF(attr); - continue; - } - - GET_TC(tc)->itemName = itemName; - GET_TC(tc)->itemValue = itemValue; - - itemName = attr; - break; - } - - if (itemName == NULL) { - GET_TC(tc)->index = GET_TC(tc)->size; - GET_TC(tc)->itemValue = NULL; - return 0; - } - - GET_TC(tc)->itemName = itemName; - GET_TC(tc)->itemValue = itemValue; - GET_TC(tc)->index++; - - return 1; -} - -static JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; -} - -static const char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { - *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); - return PyBytes_AS_STRING(GET_TC(tc)->itemName); -} - -//============================================================================= -// List iteration functions -// itemValue is borrowed from object (which is list). 
No refcounting -//============================================================================= -static void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->index = 0; - GET_TC(tc)->size = PyList_GET_SIZE((PyObject *)obj); -} - -static int List_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (GET_TC(tc)->index >= GET_TC(tc)->size) { - return 0; - } - - GET_TC(tc)->itemValue = PyList_GET_ITEM(obj, GET_TC(tc)->index); - GET_TC(tc)->index++; - return 1; -} - -static void List_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) { -} - -static JSOBJ List_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; -} - -static const char *List_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { - return NULL; -} - -//============================================================================= -// pandas Index iteration functions -//============================================================================= -static void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - GET_TC(tc)->index = 0; -} - -static int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { - const Py_ssize_t index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); - if (index == 0) { - GET_TC(tc)->cStr = "name"; - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); - } else if (index == 1) { - GET_TC(tc)->cStr = "data"; - GET_TC(tc)->itemValue = get_values(obj); - if (!GET_TC(tc)->itemValue) { - return 0; - } - } else { - return 0; - } - - GET_TC(tc)->index++; - return 1; -} - -static void Index_iterEnd(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc)) {} - -static JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; -} - -static const char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { - *outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; -} - -//============================================================================= -// pandas Series iteration functions -//============================================================================= -static void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - GET_TC(tc)->index = 0; - enc->outputFormat = VALUES; // for contained series -} - -static int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { - const Py_ssize_t index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); - if (index == 0) { - GET_TC(tc)->cStr = "name"; - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); - } else if (index == 1) { - GET_TC(tc)->cStr = "index"; - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); - } else if (index == 2) { - GET_TC(tc)->cStr = "data"; - GET_TC(tc)->itemValue = get_values(obj); - if (!GET_TC(tc)->itemValue) { - return 0; - } - } else { - return 0; - } - - GET_TC(tc)->index++; - return 1; -} - -static void Series_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - enc->outputFormat = enc->originalOutputFormat; -} - -static JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; -} - -static const char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { - *outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; -} - -//============================================================================= -// pandas DataFrame 
iteration functions -//============================================================================= -static void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - GET_TC(tc)->index = 0; - enc->outputFormat = VALUES; // for contained series & index -} - -static int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { - const Py_ssize_t index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); - if (index == 0) { - GET_TC(tc)->cStr = "columns"; - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); - } else if (index == 1) { - GET_TC(tc)->cStr = "index"; - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); - } else if (index == 2) { - GET_TC(tc)->cStr = "data"; - Py_INCREF(obj); - GET_TC(tc)->itemValue = obj; - } else { - return 0; - } - - GET_TC(tc)->index++; - return 1; -} - -static void DataFrame_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - enc->outputFormat = enc->originalOutputFormat; -} - -static JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; -} - -static const char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *tc, size_t *outLen) { - *outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; -} - -//============================================================================= -// Dict iteration functions -// itemName might converted to string (Python_Str). Do refCounting -// itemValue is borrowed from object (which is dict). No refCounting -//============================================================================= -static void Dict_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - GET_TC(tc)->index = 0; -} - -static int Dict_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - if (GET_TC(tc)->itemName) { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = NULL; - } - - if (!PyDict_Next((PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index, - &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) { - return 0; - } - - if (PyUnicode_Check(GET_TC(tc)->itemName)) { - GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName); - } else if (!PyBytes_Check(GET_TC(tc)->itemName)) { - GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName); - PyObject *itemNameTmp = GET_TC(tc)->itemName; - GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName); - Py_DECREF(itemNameTmp); - } else { - Py_INCREF(GET_TC(tc)->itemName); - } - return 1; -} - -static void Dict_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - if (GET_TC(tc)->itemName) { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = NULL; - } - Py_DECREF(GET_TC(tc)->dictObj); -} - -static JSOBJ Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; -} - -static const char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { - *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); - return PyBytes_AS_STRING(GET_TC(tc)->itemName); -} - -static void NpyArr_freeLabels(char **labels, npy_intp len) { - if (labels) { - for (npy_intp i = 0; i < len; i++) { - PyObject_Free(labels[i]); - } - PyObject_Free(labels); - } -} - -/* - * Function: NpyArr_encodeLabels - * ----------------------------- - * - * Builds an array of "encoded" labels. 
- * - * labels: PyArrayObject pointer for labels to be "encoded" - * num : number of labels - * - * "encode" is quoted above because we aren't really doing encoding - * For historical reasons this function would actually encode the entire - * array into a separate buffer with a separate call to JSON_Encode - * and would leave it to complex pointer manipulation from there to - * unpack values as needed. To make things simpler and more idiomatic - * this has instead just stringified any input save for datetime values, - * which may need to be represented in various formats. - */ -static char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, - npy_intp num) { - // NOTE this function steals a reference to labels. - PyObject *item = NULL; - const NPY_DATETIMEUNIT base = enc->datetimeUnit; - - if (!labels) { - return 0; - } - - if (PyArray_SIZE(labels) < num) { - PyErr_SetString(PyExc_ValueError, - "Label array sizes do not match corresponding data shape"); - Py_DECREF(labels); - return 0; - } - - char **ret = PyObject_Malloc(sizeof(char *) * num); - if (!ret) { - PyErr_NoMemory(); - Py_DECREF(labels); - return 0; - } - - for (npy_intp i = 0; i < num; i++) { - ret[i] = NULL; - } - - const npy_intp stride = PyArray_STRIDE(labels, 0); - char *dataptr = PyArray_DATA(labels); - const int type_num = PyArray_TYPE(labels); - PyArray_Descr *dtype = PyArray_DESCR(labels); - - for (npy_intp i = 0; i < num; i++) { - item = PyArray_GETITEM(labels, dataptr); - if (!item) { - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - int is_datetimelike = 0; - int64_t i8date; - NPY_DATETIMEUNIT dateUnit = NPY_FR_ns; - if (PyTypeNum_ISDATETIME(type_num)) { - is_datetimelike = 1; - i8date = *(int64_t *)dataptr; - dateUnit = get_datetime_metadata_from_dtype(dtype).base; - } else if (PyDate_Check(item) || PyDelta_Check(item)) { - is_datetimelike = 1; - if (PyObject_HasAttrString(item, "_value")) { - // pd.Timestamp object or pd.NaT - // see test_date_index_and_values for case with non-nano - i8date = get_long_attr(item, "_value"); - } else { - if (PyDelta_Check(item)) { - // TODO(anyone): cast below loses precision if total_seconds return - // value exceeds number of bits that significand can hold - // also liable to overflow - i8date = (int64_t)(total_seconds(item) * - 1000000000LL); // nanoseconds per second - } else { - // datetime.* objects don't follow above rules - i8date = PyDateTimeToEpoch(item, NPY_FR_ns); - } - } - } - - size_t len; - char *cLabel; - if (is_datetimelike) { - if (i8date == get_nat()) { - len = 4; - cLabel = PyObject_Malloc(len + 1); - strncpy(cLabel, "null", len + 1); - } else { - if (enc->datetimeIso) { - if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { - // TODO(username): non-nano timedelta support? - cLabel = int64ToIsoDuration(i8date, &len); - } else { - if (type_num == NPY_DATETIME) { - cLabel = int64ToIso(i8date, dateUnit, base, &len); - } else { - cLabel = PyDateTimeToIso(item, base, &len); - } - } - if (cLabel == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - } else { - int size_of_cLabel = 21; // 21 chars for int 64 - cLabel = PyObject_Malloc(size_of_cLabel); - if (scaleNanosecToUnit(&i8date, base) == -1) { - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - snprintf(cLabel, size_of_cLabel, "%" PRId64, i8date); - len = strlen(cLabel); - } - } - } else { // Fallback to string representation - // Replace item with the string to keep it alive. 
- Py_SETREF(item, PyObject_Str(item)); - if (item == NULL) { - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - cLabel = (char *)PyUnicode_AsUTF8(item); - len = strlen(cLabel); - } - - // Add 1 to include NULL terminator - ret[i] = PyObject_Malloc(len + 1); - memcpy(ret[i], cLabel, len + 1); - Py_DECREF(item); - - if (is_datetimelike) { - PyObject_Free(cLabel); - } - - if (PyErr_Occurred()) { - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - if (!ret[i]) { - PyErr_NoMemory(); - ret = 0; - break; - } - - dataptr += stride; - } - - Py_DECREF(labels); - return ret; -} - -static void Object_invokeDefaultHandler(PyObject *obj, PyObjectEncoder *enc) { - PyObject *tmpObj = NULL; - tmpObj = PyObject_CallFunctionObjArgs(enc->defaultHandler, obj, NULL); - if (!PyErr_Occurred()) { - if (tmpObj == NULL) { - PyErr_SetString(PyExc_TypeError, "Failed to execute default handler"); - } else { - encode(tmpObj, (JSONObjectEncoder *)enc, NULL, 0); - } - } - Py_XDECREF(tmpObj); - return; -} - -static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { - tc->prv = NULL; - - if (!_obj) { - tc->type = JT_INVALID; - return; - } - - PyObject *obj = (PyObject *)_obj; - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - - if (PyBool_Check(obj)) { - tc->type = (obj == Py_True) ? JT_TRUE : JT_FALSE; - return; - } else if (obj == Py_None) { - tc->type = JT_NULL; - return; - } - - TypeContext *pc = createTypeContext(); - if (!pc) { - tc->type = JT_INVALID; - return; - } - tc->prv = pc; - - if (PyTypeNum_ISDATETIME(enc->npyType)) { - int64_t longVal = *(npy_int64 *)enc->npyValue; - if (longVal == get_nat()) { - tc->type = JT_NULL; - } else { - if (enc->datetimeIso) { - if (enc->npyType == NPY_TIMEDELTA) { - pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; - } else { - pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; - } - // Currently no way to pass longVal to iso function, so use - // state management - pc->longValue = longVal; - tc->type = JT_UTF8; - } else { - NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - if (scaleNanosecToUnit(&longVal, base) == -1) { - goto INVALID; - } - pc->longValue = longVal; - tc->type = JT_LONG; - } - } - - // TODO(username): this prevents infinite loop with - // mixed-type DataFrames; - // refactor - enc->npyCtxtPassthru = NULL; - enc->npyType = -1; - return; - } - - if (PyIter_Check(obj) || (PyArray_Check(obj) && !PyArray_CheckScalar(obj))) { - goto ISITERABLE; - } - - if (PyLong_Check(obj)) { - tc->type = JT_LONG; - int overflow = 0; - pc->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow); - int err; - err = (pc->longValue == -1) && PyErr_Occurred(); - - if (overflow) { - tc->type = JT_BIGNUM; - } else if (err) { - goto INVALID; - } - - return; - } else if (PyFloat_Check(obj)) { - const double val = PyFloat_AS_DOUBLE(obj); - if (npy_isnan(val) || npy_isinf(val)) { - tc->type = JT_NULL; - } else { - pc->doubleValue = val; - tc->type = JT_DOUBLE; - } - return; - } else if (PyBytes_Check(obj)) { - pc->PyTypeToUTF8 = PyBytesToUTF8; - tc->type = JT_UTF8; - return; - } else if (PyUnicode_Check(obj)) { - pc->PyTypeToUTF8 = PyUnicodeToUTF8; - tc->type = JT_UTF8; - return; - } else if (object_is_decimal_type(obj)) { - PyObject *is_nan_py = PyObject_RichCompare(obj, obj, Py_NE); - if (is_nan_py == NULL) { - goto INVALID; - } - int is_nan = (is_nan_py == Py_True); - Py_DECREF(is_nan_py); - if (is_nan) { - tc->type = JT_NULL; - return; - } - pc->PyTypeToUTF8 = PyDecimalToUTF8Callback; - tc->type = JT_UTF8; - return; - } else if 
(PyDateTime_Check(obj) || PyDate_Check(obj)) { - if (object_is_nat_type(obj)) { - tc->type = JT_NULL; - return; - } - - if (enc->datetimeIso) { - pc->PyTypeToUTF8 = PyDateTimeToIsoCallback; - tc->type = JT_UTF8; - } else { - NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - pc->longValue = PyDateTimeToEpoch(obj, base); - tc->type = JT_LONG; - } - return; - } else if (PyTime_Check(obj)) { - pc->PyTypeToUTF8 = PyTimeToJSON; - tc->type = JT_UTF8; - return; - } else if (PyArray_IsScalar(obj, Datetime)) { - npy_int64 longVal; - if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) { - tc->type = JT_NULL; - return; - } - PyArray_Descr *dtype = PyArray_DescrFromScalar(obj); - if (!PyTypeNum_ISDATETIME(dtype->type_num)) { - PyErr_Format(PyExc_ValueError, "Could not get resolution of datetime"); - return; - } - - PyArray_Descr *outcode = PyArray_DescrFromType(NPY_INT64); - PyArray_CastScalarToCtype(obj, &longVal, outcode); - Py_DECREF(outcode); - - if (enc->datetimeIso) { - GET_TC(tc)->longValue = longVal; - pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; - enc->valueUnit = get_datetime_metadata_from_dtype(dtype).base; - tc->type = JT_UTF8; - } else { - NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - pc->longValue = PyDateTimeToEpoch(obj, base); - tc->type = JT_LONG; - } - return; - } else if (PyDelta_Check(obj)) { - // pd.Timedelta object or pd.NaT should evaluate true here - // fallback to nanoseconds per sec for other objects - // TODO(anyone): cast below loses precision if total_seconds return - // value exceeds number of bits that significand can hold - // also liable to overflow - int64_t value = PyObject_HasAttrString(obj, "_value") - ? get_long_attr(obj, "_value") - : (int64_t)(total_seconds(obj) * 1000000000LL); - - if (value == get_nat()) { - tc->type = JT_NULL; - return; - } else if (enc->datetimeIso) { - pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; - tc->type = JT_UTF8; - } else { - const int unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - if (scaleNanosecToUnit(&value, unit) != 0) { - // TODO(username): Add some kind of error handling here - } - - if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_OverflowError)) { - goto INVALID; - } - - tc->type = JT_LONG; - } - pc->longValue = value; - return; - } else if (PyArray_IsScalar(obj, Integer)) { - tc->type = JT_LONG; - PyArray_CastScalarToCtype(obj, &(pc->longValue), - PyArray_DescrFromType(NPY_INT64)); - - if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_OverflowError)) { - goto INVALID; - } - - return; - } else if (PyArray_IsScalar(obj, Bool)) { - PyArray_CastScalarToCtype(obj, &(pc->longValue), - PyArray_DescrFromType(NPY_BOOL)); - tc->type = (pc->longValue) ? 
JT_TRUE : JT_FALSE; - return; - } else if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) { - PyArray_CastScalarToCtype(obj, &(pc->doubleValue), - PyArray_DescrFromType(NPY_DOUBLE)); - tc->type = JT_DOUBLE; - return; - } else if (PyArray_CheckScalar(obj)) { - PyErr_Format(PyExc_TypeError, - "%R (numpy-scalar) is not JSON serializable at the moment", - obj); - goto INVALID; - } else if (object_is_na_type(obj)) { - tc->type = JT_NULL; - return; - } - -ISITERABLE: - - if (object_is_index_type(obj)) { - if (enc->outputFormat == SPLIT) { - tc->type = JT_OBJECT; - pc->iterBegin = Index_iterBegin; - pc->iterEnd = Index_iterEnd; - pc->iterNext = Index_iterNext; - pc->iterGetValue = Index_iterGetValue; - pc->iterGetName = Index_iterGetName; - return; - } - - pc->newObj = get_values(obj); - if (pc->newObj) { - tc->type = JT_ARRAY; - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; - } else { - goto INVALID; - } - - return; - } else if (object_is_series_type(obj)) { - if (enc->outputFormat == SPLIT) { - tc->type = JT_OBJECT; - pc->iterBegin = Series_iterBegin; - pc->iterEnd = Series_iterEnd; - pc->iterNext = Series_iterNext; - pc->iterGetValue = Series_iterGetValue; - pc->iterGetName = Series_iterGetName; - return; - } - - pc->newObj = get_values(obj); - if (!pc->newObj) { - goto INVALID; - } - - if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { - tc->type = JT_OBJECT; - PyObject *tmpObj = PyObject_GetAttrString(obj, "index"); - if (!tmpObj) { - goto INVALID; - } - PyObject *values = get_values(tmpObj); - Py_DECREF(tmpObj); - if (!values) { - goto INVALID; - } - - if (!PyArray_Check(pc->newObj)) { - PyErr_SetString(PyExc_TypeError, - "Object_beginTypeContext received a non-array object"); - goto INVALID; - } - const PyArrayObject *arrayobj = (const PyArrayObject *)pc->newObj; - pc->columnLabelsLen = PyArray_DIM(arrayobj, 0); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); - if (!pc->columnLabels) { - goto INVALID; - } - } else { - tc->type = JT_ARRAY; - } - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; - return; - } else if (PyArray_Check(obj)) { - if (enc->npyCtxtPassthru) { - pc->npyarr = enc->npyCtxtPassthru; - tc->type = (pc->npyarr->columnLabels ? JT_OBJECT : JT_ARRAY); - - pc->iterBegin = NpyArrPassThru_iterBegin; - pc->iterNext = NpyArr_iterNext; - pc->iterEnd = NpyArrPassThru_iterEnd; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; - - enc->npyCtxtPassthru = NULL; - return; - } - - tc->type = JT_ARRAY; - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; - return; - } else if (object_is_dataframe_type(obj)) { - if (enc->blkCtxtPassthru) { - pc->pdblock = enc->blkCtxtPassthru; - tc->type = - (pc->pdblock->npyCtxts[0]->columnLabels ? 
JT_OBJECT : JT_ARRAY); - - pc->iterBegin = PdBlockPassThru_iterBegin; - pc->iterEnd = PdBlockPassThru_iterEnd; - pc->iterNext = PdBlock_iterNextItem; - pc->iterGetName = PdBlock_iterGetName; - pc->iterGetValue = NpyArr_iterGetValue; - - enc->blkCtxtPassthru = NULL; - return; - } - - if (enc->outputFormat == SPLIT) { - tc->type = JT_OBJECT; - pc->iterBegin = DataFrame_iterBegin; - pc->iterEnd = DataFrame_iterEnd; - pc->iterNext = DataFrame_iterNext; - pc->iterGetValue = DataFrame_iterGetValue; - pc->iterGetName = DataFrame_iterGetName; - return; - } - - pc->iterBegin = PdBlock_iterBegin; - pc->iterEnd = PdBlock_iterEnd; - pc->iterNext = PdBlock_iterNext; - pc->iterGetName = PdBlock_iterGetName; - pc->iterGetValue = NpyArr_iterGetValue; - - if (enc->outputFormat == VALUES) { - tc->type = JT_ARRAY; - } else if (enc->outputFormat == RECORDS) { - tc->type = JT_ARRAY; - PyObject *tmpObj = PyObject_GetAttrString(obj, "columns"); - if (!tmpObj) { - goto INVALID; - } - PyObject *values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - goto INVALID; - } - pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); - Py_DECREF(tmpObj); - if (!pc->columnLabels) { - goto INVALID; - } - } else if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { - tc->type = JT_OBJECT; - PyObject *tmpObj = - (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "index") - : PyObject_GetAttrString(obj, "columns")); - if (!tmpObj) { - goto INVALID; - } - PyObject *values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - goto INVALID; - } - pc->rowLabelsLen = PyObject_Size(tmpObj); - pc->rowLabels = - NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->rowLabelsLen); - Py_DECREF(tmpObj); - tmpObj = - (enc->outputFormat == INDEX ? 
PyObject_GetAttrString(obj, "columns") - : PyObject_GetAttrString(obj, "index")); - if (!tmpObj) { - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); - Py_DECREF(tmpObj); - if (!pc->columnLabels) { - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - - if (enc->outputFormat == COLUMNS) { - pc->transpose = 1; - } - } else { - goto INVALID; - } - return; - } else if (PyDict_Check(obj)) { - tc->type = JT_OBJECT; - pc->iterBegin = Dict_iterBegin; - pc->iterEnd = Dict_iterEnd; - pc->iterNext = Dict_iterNext; - pc->iterGetValue = Dict_iterGetValue; - pc->iterGetName = Dict_iterGetName; - pc->dictObj = obj; - Py_INCREF(obj); - - return; - } else if (PyList_Check(obj)) { - tc->type = JT_ARRAY; - pc->iterBegin = List_iterBegin; - pc->iterEnd = List_iterEnd; - pc->iterNext = List_iterNext; - pc->iterGetValue = List_iterGetValue; - pc->iterGetName = List_iterGetName; - return; - } else if (PyTuple_Check(obj)) { - tc->type = JT_ARRAY; - pc->iterBegin = Tuple_iterBegin; - pc->iterEnd = Tuple_iterEnd; - pc->iterNext = Tuple_iterNext; - pc->iterGetValue = Tuple_iterGetValue; - pc->iterGetName = Tuple_iterGetName; - return; - } else if (PyAnySet_Check(obj)) { - tc->type = JT_ARRAY; - pc->iterBegin = Set_iterBegin; - pc->iterEnd = Set_iterEnd; - pc->iterNext = Set_iterNext; - pc->iterGetValue = Set_iterGetValue; - pc->iterGetName = Set_iterGetName; - return; - } - - PyObject *toDictFunc = PyObject_GetAttrString(obj, "toDict"); - - if (toDictFunc) { - PyObject *tuple = PyTuple_New(0); - PyObject *toDictResult = PyObject_Call(toDictFunc, tuple, NULL); - Py_DECREF(tuple); - Py_DECREF(toDictFunc); - - if (toDictResult == NULL) { - PyErr_Clear(); - tc->type = JT_NULL; - return; - } - - if (!PyDict_Check(toDictResult)) { - Py_DECREF(toDictResult); - tc->type = JT_NULL; - return; - } - - tc->type = JT_OBJECT; - pc->iterBegin = Dict_iterBegin; - pc->iterEnd = Dict_iterEnd; - pc->iterNext = Dict_iterNext; - pc->iterGetValue = Dict_iterGetValue; - pc->iterGetName = Dict_iterGetName; - pc->dictObj = toDictResult; - return; - } - - PyErr_Clear(); - - if (enc->defaultHandler) { - Object_invokeDefaultHandler(obj, enc); - goto INVALID; - } - - tc->type = JT_OBJECT; - pc->iterBegin = Dir_iterBegin; - pc->iterEnd = Dir_iterEnd; - pc->iterNext = Dir_iterNext; - pc->iterGetValue = Dir_iterGetValue; - pc->iterGetName = Dir_iterGetName; - return; - -INVALID: - tc->type = JT_INVALID; - PyObject_Free(tc->prv); - tc->prv = NULL; - return; -} - -static void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - if (tc->prv) { - Py_XDECREF(GET_TC(tc)->newObj); - GET_TC(tc)->newObj = NULL; - NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen); - GET_TC(tc)->rowLabels = NULL; - NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); - GET_TC(tc)->columnLabels = NULL; - GET_TC(tc)->cStr = NULL; - PyObject_Free(tc->prv); - tc->prv = NULL; - } -} - -static const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc, - size_t *_outLen) { - return GET_TC(tc)->PyTypeToUTF8(obj, tc, _outLen); -} - -static JSINT64 Object_getLongValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return 
GET_TC(tc)->longValue; -} - -static double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->doubleValue; -} - -static const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, - size_t *_outLen) { - PyObject *repr = PyObject_Str(obj); - const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *)_outLen); - char *bytes = PyObject_Malloc(*_outLen + 1); - memcpy(bytes, str, *_outLen + 1); - GET_TC(tc)->cStr = bytes; - - Py_DECREF(repr); - - return GET_TC(tc)->cStr; -} - -static void Object_releaseObject(JSOBJ _obj) { Py_DECREF((PyObject *)_obj); } - -static void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->iterBegin(obj, tc); -} - -static int Object_iterNext(JSOBJ obj, JSONTypeContext *tc) { - return GET_TC(tc)->iterNext(obj, tc); -} - -static void Object_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->iterEnd(obj, tc); -} - -static JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { - return GET_TC(tc)->iterGetValue(obj, tc); -} - -static const char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, - size_t *outLen) { - return GET_TC(tc)->iterGetName(obj, tc, outLen); -} - -PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, - PyObject *kwargs) { - PyDateTime_IMPORT; - if (PyDateTimeAPI == NULL) { - return NULL; - } - - PandasDateTime_IMPORT; - if (PandasDateTimeAPI == NULL) { - return NULL; - } - - static char *kwlist[] = {"obj", - "ensure_ascii", - "double_precision", - "encode_html_chars", - "orient", - "date_unit", - "iso_dates", - "default_handler", - "indent", - NULL}; - - PyObject *oinput = NULL; - PyObject *oensureAscii = NULL; - int idoublePrecision = 10; // default double precision setting - PyObject *oencodeHTMLChars = NULL; - char *sOrient = NULL; - char *sdateFormat = NULL; - PyObject *oisoDates = 0; - PyObject *odefHandler = 0; - int indent = 0; - - PyObjectEncoder pyEncoder = { - { - .beginTypeContext = Object_beginTypeContext, - .endTypeContext = Object_endTypeContext, - .getStringValue = Object_getStringValue, - .getLongValue = Object_getLongValue, - .getIntValue = NULL, - .getDoubleValue = Object_getDoubleValue, - .getBigNumStringValue = Object_getBigNumStringValue, - .iterBegin = Object_iterBegin, - .iterNext = Object_iterNext, - .iterEnd = Object_iterEnd, - .iterGetValue = Object_iterGetValue, - .iterGetName = Object_iterGetName, - .releaseObject = Object_releaseObject, - .malloc = PyObject_Malloc, - .realloc = PyObject_Realloc, - .free = PyObject_Free, - .recursionMax = -1, - .doublePrecision = idoublePrecision, - .forceASCII = 1, - .encodeHTMLChars = 0, - .indent = indent, - .errorMsg = NULL, - }, - .npyCtxtPassthru = NULL, - .blkCtxtPassthru = NULL, - .npyType = -1, - .npyValue = NULL, - .datetimeIso = 0, - .datetimeUnit = NPY_FR_ms, - .outputFormat = COLUMNS, - .defaultHandler = NULL, - }; - JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder; - - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOOi", kwlist, &oinput, - &oensureAscii, &idoublePrecision, - &oencodeHTMLChars, &sOrient, &sdateFormat, - &oisoDates, &odefHandler, &indent)) { - return NULL; - } - - if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) { - encoder->forceASCII = 0; - } - - if (oencodeHTMLChars != NULL && PyObject_IsTrue(oencodeHTMLChars)) { - encoder->encodeHTMLChars = 1; - } - - if (idoublePrecision > JSON_DOUBLE_MAX_DECIMALS || idoublePrecision < 0) { - PyErr_Format( - PyExc_ValueError, - "Invalid value '%d' for option 'double_precision', max is '%u'", - 
idoublePrecision, JSON_DOUBLE_MAX_DECIMALS); - return NULL; - } - encoder->doublePrecision = idoublePrecision; - - if (sOrient != NULL) { - if (strcmp(sOrient, "records") == 0) { - pyEncoder.outputFormat = RECORDS; - } else if (strcmp(sOrient, "index") == 0) { - pyEncoder.outputFormat = INDEX; - } else if (strcmp(sOrient, "split") == 0) { - pyEncoder.outputFormat = SPLIT; - } else if (strcmp(sOrient, "values") == 0) { - pyEncoder.outputFormat = VALUES; - } else if (strcmp(sOrient, "columns") != 0) { - PyErr_Format(PyExc_ValueError, "Invalid value '%s' for option 'orient'", - sOrient); - return NULL; - } - } - - if (sdateFormat != NULL) { - if (strcmp(sdateFormat, "s") == 0) { - pyEncoder.datetimeUnit = NPY_FR_s; - } else if (strcmp(sdateFormat, "ms") == 0) { - pyEncoder.datetimeUnit = NPY_FR_ms; - } else if (strcmp(sdateFormat, "us") == 0) { - pyEncoder.datetimeUnit = NPY_FR_us; - } else if (strcmp(sdateFormat, "ns") == 0) { - pyEncoder.datetimeUnit = NPY_FR_ns; - } else { - PyErr_Format(PyExc_ValueError, - "Invalid value '%s' for option 'date_unit'", sdateFormat); - return NULL; - } - } - - if (oisoDates != NULL && PyObject_IsTrue(oisoDates)) { - pyEncoder.datetimeIso = 1; - } - - if (odefHandler != NULL && odefHandler != Py_None) { - if (!PyCallable_Check(odefHandler)) { - PyErr_SetString(PyExc_TypeError, "Default handler is not callable"); - return NULL; - } - pyEncoder.defaultHandler = odefHandler; - } - - encoder->indent = indent; - - pyEncoder.originalOutputFormat = pyEncoder.outputFormat; - - char buffer[65536]; - char *ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer)); - if (PyErr_Occurred()) { - return NULL; - } - - if (encoder->errorMsg) { - if (ret != buffer) { - encoder->free(ret); - } - PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg); - return NULL; - } - - PyObject *newobj = PyUnicode_FromString(ret); - - if (ret != buffer) { - encoder->free(ret); - } - - return newobj; -} diff --git a/pandas/_libs/src/vendored/ujson/python/ujson.c b/pandas/_libs/src/vendored/ujson/python/ujson.c deleted file mode 100644 index 2ee084b9304f4..0000000000000 --- a/pandas/_libs/src/vendored/ujson/python/ujson.c +++ /dev/null @@ -1,452 +0,0 @@ -/* -Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: -* Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -* Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. -* Neither the name of the ESN Social Software AB nor the -names of its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE -GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF -THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights -reserved. - -Numeric decoder derived from TCL library -https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms -* Copyright (c) 1988-1993 The Regents of the University of California. -* Copyright (c) 1994 Sun Microsystems, Inc. -*/ - -// Licence at LICENSES/ULTRAJSON_LICENSE - -#define PY_SSIZE_T_CLEAN -#include - -#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY -#include "numpy/arrayobject.h" - -/* objToJSON */ -PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs); -void *initObjToJSON(void); - -/* JSONToObj */ -PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs); - -#define ENCODER_HELP_TEXT \ - "Use ensure_ascii=false to output UTF-8. Pass in double_precision to " \ - "alter the maximum digit precision of doubles. Set " \ - "encode_html_chars=True to encode < > & as unicode escape sequences." - -static PyMethodDef ujsonMethods[] = { - {"ujson_dumps", (PyCFunction)(void (*)(void))objToJSON, - METH_VARARGS | METH_KEYWORDS, - "Converts arbitrary object recursively into JSON. " ENCODER_HELP_TEXT}, - {"ujson_loads", (PyCFunction)(void (*)(void))JSONToObj, - METH_VARARGS | METH_KEYWORDS, - "Converts JSON as string to dict object structure. 
Use precise_float=True " - "to use high precision float decoder."}, - {NULL, NULL, 0, NULL} /* Sentinel */ -}; - -typedef struct { - PyObject *type_decimal; - PyObject *type_dataframe; - PyObject *type_series; - PyObject *type_index; - PyObject *type_nat; - PyObject *type_na; -} modulestate; - -#define modulestate(o) ((modulestate *)PyModule_GetState(o)) - -static int module_traverse(PyObject *m, visitproc visit, void *arg); -static int module_clear(PyObject *m); -static void module_free(void *module); - -static struct PyModuleDef moduledef = {.m_base = PyModuleDef_HEAD_INIT, - .m_name = "pandas._libs.json", - .m_methods = ujsonMethods, - .m_size = sizeof(modulestate), - .m_traverse = module_traverse, - .m_clear = module_clear, - .m_free = module_free}; - -#ifndef PYPY_VERSION -/* Used in objToJSON.c */ -int object_is_decimal_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_decimal = state->type_decimal; - if (type_decimal == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_decimal); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; -} - -int object_is_dataframe_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_dataframe = state->type_dataframe; - if (type_dataframe == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_dataframe); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; -} - -int object_is_series_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_series = state->type_series; - if (type_series == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_series); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; -} - -int object_is_index_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_index = state->type_index; - if (type_index == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_index); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; -} - -int object_is_nat_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_nat = state->type_nat; - if (type_nat == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_nat); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; -} - -int object_is_na_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_na = state->type_na; - if (type_na == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_na); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; -} -#else -/* Used in objToJSON.c */ -int object_is_decimal_type(PyObject *obj) { 
- PyObject *module = PyImport_ImportModule("decimal"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_decimal = PyObject_GetAttrString(module, "Decimal"); - if (type_decimal == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_decimal); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_decimal); - PyErr_Clear(); - return 0; - } - return result; -} - -int object_is_dataframe_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("pandas"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_dataframe = PyObject_GetAttrString(module, "DataFrame"); - if (type_dataframe == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_dataframe); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_dataframe); - PyErr_Clear(); - return 0; - } - return result; -} - -int object_is_series_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("pandas"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_series = PyObject_GetAttrString(module, "Series"); - if (type_series == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_series); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_series); - PyErr_Clear(); - return 0; - } - return result; -} - -int object_is_index_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("pandas"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_index = PyObject_GetAttrString(module, "Index"); - if (type_index == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_index); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_index); - PyErr_Clear(); - return 0; - } - return result; -} - -int object_is_nat_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("pandas._libs.tslibs.nattype"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_nat = PyObject_GetAttrString(module, "NaTType"); - if (type_nat == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_nat); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_nat); - PyErr_Clear(); - return 0; - } - return result; -} - -int object_is_na_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("pandas._libs.missing"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_na = PyObject_GetAttrString(module, "NAType"); - if (type_na == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_na); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_na); - PyErr_Clear(); - return 0; - } - return result; -} - -#endif - -static int module_traverse(PyObject *m, visitproc visit, void *arg) { - Py_VISIT(modulestate(m)->type_decimal); - Py_VISIT(modulestate(m)->type_dataframe); - Py_VISIT(modulestate(m)->type_series); - Py_VISIT(modulestate(m)->type_index); - Py_VISIT(modulestate(m)->type_nat); - Py_VISIT(modulestate(m)->type_na); - return 0; -} - -static int module_clear(PyObject *m) { - Py_CLEAR(modulestate(m)->type_decimal); - Py_CLEAR(modulestate(m)->type_dataframe); - Py_CLEAR(modulestate(m)->type_series); - Py_CLEAR(modulestate(m)->type_index); - Py_CLEAR(modulestate(m)->type_nat); - Py_CLEAR(modulestate(m)->type_na); - return 0; -} - -static void module_free(void 
*module) { module_clear((PyObject *)module); } - -PyMODINIT_FUNC PyInit_json(void) { - import_array() PyObject *module; - -#ifndef PYPY_VERSION - // This function is not supported in PyPy. - if ((module = PyState_FindModule(&moduledef)) != NULL) { - Py_INCREF(module); - return module; - } -#endif - - module = PyModule_Create(&moduledef); - if (module == NULL) { - return NULL; - } - -#ifdef Py_GIL_DISABLED - PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED); -#endif - -#ifndef PYPY_VERSION - PyObject *mod_decimal = PyImport_ImportModule("decimal"); - if (mod_decimal) { - PyObject *type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal"); - assert(type_decimal != NULL); - modulestate(module)->type_decimal = type_decimal; - Py_DECREF(mod_decimal); - } - - PyObject *mod_pandas = PyImport_ImportModule("pandas"); - if (mod_pandas) { - PyObject *type_dataframe = PyObject_GetAttrString(mod_pandas, "DataFrame"); - assert(type_dataframe != NULL); - modulestate(module)->type_dataframe = type_dataframe; - - PyObject *type_series = PyObject_GetAttrString(mod_pandas, "Series"); - assert(type_series != NULL); - modulestate(module)->type_series = type_series; - - PyObject *type_index = PyObject_GetAttrString(mod_pandas, "Index"); - assert(type_index != NULL); - modulestate(module)->type_index = type_index; - - Py_DECREF(mod_pandas); - } - - PyObject *mod_nattype = PyImport_ImportModule("pandas._libs.tslibs.nattype"); - if (mod_nattype) { - PyObject *type_nat = PyObject_GetAttrString(mod_nattype, "NaTType"); - assert(type_nat != NULL); - modulestate(module)->type_nat = type_nat; - - Py_DECREF(mod_nattype); - } - - PyObject *mod_natype = PyImport_ImportModule("pandas._libs.missing"); - if (mod_natype) { - PyObject *type_na = PyObject_GetAttrString(mod_natype, "NAType"); - assert(type_na != NULL); - modulestate(module)->type_na = type_na; - - Py_DECREF(mod_natype); - } else { - PyErr_Clear(); - } -#endif - - /* Not vendored for now - JSONDecodeError = PyErr_NewException("ujson.JSONDecodeError", - PyExc_ValueError, NULL); Py_XINCREF(JSONDecodeError); if - (PyModule_AddObject(module, "JSONDecodeError", JSONDecodeError) < 0) - { - Py_XDECREF(JSONDecodeError); - Py_CLEAR(JSONDecodeError); - Py_DECREF(module); - return NULL; - } - */ - - return module; -} diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index eb5c7739e5132..baf85649da210 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1642,6 +1642,8 @@ def kind(self) -> str: @cache_readonly def itemsize(self) -> int: """Return the number of bytes in this dtype""" + if hasattr(self.pyarrow_dtype, "bit_width"): + return self.pyarrow_dtype.bit_width // 8 return self.numpy_dtype.itemsize def construct_array_type(self) -> type_t[BaseMaskedArray]: @@ -2307,6 +2309,8 @@ def kind(self) -> str: @cache_readonly def itemsize(self) -> int: """Return the number of bytes in this dtype""" + if hasattr(self.pyarrow_dtype, "bit_width"): + return self.pyarrow_dtype.bit_width // 8 return self.numpy_dtype.itemsize def construct_array_type(self) -> type_t[ArrowExtensionArray]: diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 38c84e45c6fe8..687bad160d95f 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1256,3 +1256,37 @@ def test_categorical_nan_no_dtype_conversion(): expected = pd.DataFrame({"a": Categorical([1], [1]), "b": [1]}) df.loc[0, "a"] = np.array([1]) tm.assert_frame_equal(df, expected) + +import pyarrow as pa +from 
pandas.core.dtypes.dtypes import ArrowDtype
+
+
+class TestArrowDtype:
+    @pytest.mark.parametrize(
+        "pa_dtype, expected_itemsize",
+        [
+            pytest.param(lambda: pa.date32(), 4, id="date32"),
+            pytest.param(lambda: pa.date64(), 8, id="date64"),
+            pytest.param(lambda: pa.time32("s"), 4, id="time32_s"),
+            pytest.param(lambda: pa.time64("us"), 8, id="time64_us"),
+            pytest.param(lambda: pa.int32(), 4, id="int32"),
+            pytest.param(lambda: pa.int64(), 8, id="int64"),
+        ],
+    )
+    def test_itemsize_with_bit_width(self, pa_dtype, expected_itemsize):
+        """ArrowDtype.itemsize uses the pyarrow type's bit_width when available."""
+        pytest.importorskip("pyarrow", "12.0.1")
+
+        dtype = ArrowDtype(pa_dtype())
+        assert dtype.itemsize == expected_itemsize
+
+    def test_itemsize_fallback_to_numpy(self):
+        """ArrowDtype.itemsize falls back to numpy_dtype when bit_width is unavailable."""
+        pytest.importorskip("pyarrow", "12.0.1")
+
+        dtype = ArrowDtype(pa.string())
+        result = dtype.itemsize
+        assert isinstance(result, int)
+        assert result > 0
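
For reviewers, a minimal usage sketch of how the itemsize change is expected to behave from the Python side. It is not part of the patch; it assumes pyarrow is installed and simply mirrors the values asserted in the new tests above (pandas.ArrowDtype is the public alias of the class modified in pandas/core/dtypes/dtypes.py).

    # Sketch only: expected ArrowDtype.itemsize behavior after this change.
    import pyarrow as pa
    from pandas import ArrowDtype

    # Fixed-width Arrow types expose bit_width, so itemsize is now derived
    # as bit_width // 8 instead of going through numpy_dtype.itemsize.
    assert ArrowDtype(pa.int64()).itemsize == 8        # 64 bits -> 8 bytes
    assert ArrowDtype(pa.date32()).itemsize == 4       # 32-bit days since epoch
    assert ArrowDtype(pa.time64("us")).itemsize == 8   # 64-bit microseconds

    # Variable-width types such as pa.string() have no fixed bit_width; per the
    # accompanying fallback test they keep reporting the numpy_dtype itemsize.
    print(ArrowDtype(pa.string()).itemsize)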