Skip to content

Commit

Permalink
MAINT: Use C++ for tokenizer unicode-kind templating
Browse files Browse the repository at this point in the history
  • Loading branch information
seberg committed Feb 11, 2022
1 parent f69ddd7 commit 6fd9670
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 42 deletions.
2 changes: 1 addition & 1 deletion numpy/core/setup.py
Expand Up @@ -964,7 +964,7 @@ def gl_if_msvc(build_cmd):
join('src', 'multiarray', 'textreading', 'rows.c'),
join('src', 'multiarray', 'textreading', 'stream_pyobject.c'),
join('src', 'multiarray', 'textreading', 'str_to_int.c'),
join('src', 'multiarray', 'textreading', 'tokenize.c.src'),
join('src', 'multiarray', 'textreading', 'tokenize.cpp'),
]

#######################################################################
Expand Down
8 changes: 8 additions & 0 deletions numpy/core/src/multiarray/textreading/growth.h
@@ -1,7 +1,15 @@
#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_
#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_

/*
 * Declaration of the size-growth helper shared by the text-reading sources.
 *
 * This header includes nothing itself: `npy_intp` and `NPY_NO_EXPORT` must
 * already have been made available by whoever includes it.
 *
 * The prototype is wrapped in `extern "C"` when compiled as C++, so that a
 * C++ translation unit including this header refers to the function with
 * C linkage rather than a C++-mangled name.
 */
#ifdef __cplusplus
extern "C" {
#endif

/*
 * Only the prototype is visible here; the definition lives elsewhere.
 *
 * NOTE(review): judging from the name and signature, this presumably
 * enlarges `*size` (by at least `min_grow` elements) and returns the new
 * size multiplied by `itemsize`, i.e. a byte count suitable for a
 * reallocation -- confirm against the implementation, including how
 * overflow/failure is reported to the caller.
 */
NPY_NO_EXPORT npy_intp
grow_size_and_multiply(npy_intp *size, npy_intp min_grow, npy_intp itemsize);

#ifdef __cplusplus
}
#endif

#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_ */
8 changes: 8 additions & 0 deletions numpy/core/src/multiarray/textreading/parser_config.h
Expand Up @@ -4,6 +4,10 @@

#include <stdbool.h>

#ifdef __cplusplus
extern "C" {
#endif

typedef struct {
/*
* Field delimiter character.
Expand Down Expand Up @@ -58,4 +62,8 @@ typedef struct {
} parser_config;


#ifdef __cplusplus
}
#endif

#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_ */
8 changes: 8 additions & 0 deletions numpy/core/src/multiarray/textreading/stream.h
Expand Up @@ -3,6 +3,10 @@

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
* When getting the next line, we hope that the buffer provider can already
* give some information about the newlines, because for Python iterables
Expand Down Expand Up @@ -38,4 +42,8 @@ typedef struct _stream {
((s)->stream_nextbuf((s), start, end, kind))
#define stream_close(s) ((s)->stream_close((s)))

#ifdef __cplusplus
}
#endif

#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ */
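69 changes: 28 additions & 41 deletions numpy/core/src/multiarray/textreading/tokenize.c.src → numpy/core/src/multiarray/textreading/tokenize.cpp
@@ -1,11 +1,6 @@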

#include <Python.h>

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>

#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
#include "numpy/ndarraytypes.h"
Expand All @@ -15,7 +10,6 @@
#include "textreading/parser_config.h"
#include "textreading/growth.h"


/*
How parsing quoted fields works:
Expand Down Expand Up @@ -45,12 +39,10 @@
*/


/**begin repeat
* #type = Py_UCS1, Py_UCS2, Py_UCS4#
*/
template <typename UCS>
static NPY_INLINE int
copy_to_field_buffer_@type@(tokenizer_state *ts,
const @type@ *chunk_start, const @type@ *chunk_end)
copy_to_field_buffer(tokenizer_state *ts,
const UCS *chunk_start, const UCS *chunk_end)
{
npy_intp chunk_length = chunk_end - chunk_start;
npy_intp size = chunk_length + ts->field_buffer_pos + 2;
Expand All @@ -62,8 +54,8 @@ copy_to_field_buffer_@type@(tokenizer_state *ts,
"line too long to handle while reading file.");
return -1;
}
Py_UCS4 *grown = PyMem_Realloc(ts->field_buffer, alloc_size);
if (grown == NULL) {
Py_UCS4 *grown = (Py_UCS4 *)PyMem_Realloc(ts->field_buffer, alloc_size);
if (grown == nullptr) {
PyErr_NoMemory();
return -1;
}
Expand All @@ -79,7 +71,6 @@ copy_to_field_buffer_@type@(tokenizer_state *ts,
ts->field_buffer_pos += chunk_length;
return 0;
}
/**end repeat**/


static NPY_INLINE int
Expand All @@ -99,8 +90,8 @@ add_field(tokenizer_state *ts)
"too many columns found; cannot read file.");
return -1;
}
field_info *fields = PyMem_Realloc(ts->fields, alloc_size);
if (fields == NULL) {
field_info *fields = (field_info *)PyMem_Realloc(ts->fields, alloc_size);
if (fields == nullptr) {
PyErr_NoMemory();
return -1;
}
Expand All @@ -117,16 +108,13 @@ add_field(tokenizer_state *ts)
}


/**begin repeat
* #kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, PyUnicode_4BYTE_KIND#
* #type = Py_UCS1, Py_UCS2, Py_UCS4#
*/
template <typename UCS>
static NPY_INLINE int
tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
tokenizer_core(tokenizer_state *ts, parser_config *const config)
{
@type@ *pos = (@type@ *)ts->pos;
@type@ *stop = (@type@ *)ts->end;
@type@ *chunk_start;
UCS *pos = (UCS *)ts->pos;
UCS *stop = (UCS *)ts->end;
UCS *chunk_start;

if (ts->state == TOKENIZE_CHECK_QUOTED) {
/* before we can check for quotes, strip leading whitespace */
Expand Down Expand Up @@ -174,7 +162,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
break;
}
}
if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
if (copy_to_field_buffer(ts, chunk_start, pos) < 0) {
return -1;
}
pos++;
Expand All @@ -201,7 +189,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
break;
}
}
if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
if (copy_to_field_buffer(ts, chunk_start, pos) < 0) {
return -1;
}
pos++;
Expand All @@ -215,7 +203,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
break;
}
}
if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
if (copy_to_field_buffer(ts, chunk_start, pos) < 0) {
return -1;
}
pos++;
Expand All @@ -224,7 +212,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
case TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE:
if (*pos == config->quote) {
/* Copy the quote character directly from the config: */
if (copy_to_field_buffer_Py_UCS4(ts,
if (copy_to_field_buffer(ts,
&config->quote, &config->quote+1) < 0) {
return -1;
}
Expand Down Expand Up @@ -271,7 +259,6 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
ts->pos = (char *)pos;
return 0;
}
/**end repeat**/


/*
Expand Down Expand Up @@ -308,7 +295,7 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
ts->field_buffer_pos = 0;
ts->num_fields = 0;

while (1) {
while (true) {
/*
* This loop adds new fields to the result (to make up a full row)
* until the row ends (typically a line end or the file end)
Expand Down Expand Up @@ -352,14 +339,14 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
}
int status;
if (ts->unicode_kind == PyUnicode_1BYTE_KIND) {
status = tokenizer_core_Py_UCS1(ts, config);
status = tokenizer_core<Py_UCS1>(ts, config);
}
else if (ts->unicode_kind == PyUnicode_2BYTE_KIND) {
status = tokenizer_core_Py_UCS2(ts, config);
status = tokenizer_core<Py_UCS2>(ts, config);
}
else {
assert(ts->unicode_kind == PyUnicode_4BYTE_KIND);
status = tokenizer_core_Py_UCS4(ts, config);
status = tokenizer_core<Py_UCS4>(ts, config);
}
if (status < 0) {
return -1;
Expand Down Expand Up @@ -408,11 +395,11 @@ NPY_NO_EXPORT void
tokenizer_clear(tokenizer_state *ts)
{
PyMem_FREE(ts->field_buffer);
ts->field_buffer = NULL;
ts->field_buffer = nullptr;
ts->field_buffer_length = 0;

PyMem_FREE(ts->fields);
ts->fields = NULL;
ts->fields = nullptr;
ts->fields_size = 0;
}

Expand All @@ -437,18 +424,18 @@ tokenizer_init(tokenizer_state *ts, parser_config *config)
ts->num_fields = 0;

ts->buf_state = 0;
ts->pos = NULL;
ts->end = NULL;
ts->pos = nullptr;
ts->end = nullptr;

ts->field_buffer = PyMem_Malloc(32 * sizeof(Py_UCS4));
if (ts->field_buffer == NULL) {
ts->field_buffer = (Py_UCS4 *)PyMem_Malloc(32 * sizeof(Py_UCS4));
if (ts->field_buffer == nullptr) {
PyErr_NoMemory();
return -1;
}
ts->field_buffer_length = 32;

ts->fields = PyMem_Malloc(4 * sizeof(*ts->fields));
if (ts->fields == NULL) {
ts->fields = (field_info *)PyMem_Malloc(4 * sizeof(*ts->fields));
if (ts->fields == nullptr) {
PyErr_NoMemory();
return -1;
}
Expand Down
8 changes: 8 additions & 0 deletions numpy/core/src/multiarray/textreading/tokenize.h
Expand Up @@ -8,6 +8,10 @@
#include "textreading/stream.h"
#include "textreading/parser_config.h"

#ifdef __cplusplus
extern "C" {
#endif


typedef enum {
/* Initialization of fields */
Expand Down Expand Up @@ -75,4 +79,8 @@ tokenizer_init(tokenizer_state *ts, parser_config *config);
NPY_NO_EXPORT int
tokenize(stream *s, tokenizer_state *ts, parser_config *const config);

#ifdef __cplusplus
}
#endif

#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ */

0 comments on commit 6fd9670

Please sign in to comment.