From 39487e5a4790dafe2913460d8d587f935fd53d82 Mon Sep 17 00:00:00 2001 From: "Christoph M. Becker" Date: Sun, 2 Dec 2018 16:28:18 +0100 Subject: [PATCH] Allow empty $escape to eschew escaping CSV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Albeit CSV is still a widespread data exchange format, it has never been officially standardized. There exists, however, the “informational” RFC 4180[1] which has no notion of escape characters, but rather defines `escaped` as strings enclosed in double-quotes where contained double-quotes have to be doubled. While this concept is supported by PHP's implementation (`$enclosure`), the `$escape` sometimes interferes, so that `fgetcsv()` is unable to correctly parse externally generated CSV, and `fputcsv()` is sometimes generating non-compliant CSV. Since PHP's `$escape` concept is availble for many years, we cannot drop it for BC reasons (even though many consider it as bug). Instead we allow to pass an empty string as `$escape` parameter to the respective functions, which results in ignoring/omitting any escaping, and as such is more inline with RFC 4180. It is noteworthy that this is almost no userland BC break, since formerly most functions did not accept an empty string, and failed in this case. The only exception was `str_getcsv()` which did accept an empty string, and used a backslash as escape character then (which appears to be unintended behavior, anyway). The changed functions are `fputcsv()`, `fgetcsv()` and `str_getcsv()`, and also the `::setCsvControl()`, `::getCsvControl()`, `::fputcsv()`, and `::fgetcsv()` methods of `SplFileObject`. The implementation also changes the type of the escape parameter of the PHP_APIs `php_fgetcsv()` and `php_fputcsv()` from `char` to `int`, where `PHP_CSV_NO_ESCAPE` means to ignore/omit escaping. The parameter accepts the same values as `isalpha()` and friends, i.e. “the value of which shall be representable as an `unsigned char` or shall equal the value of the macro `EOF`. If the argument has any other value, the behavior is undefined.” This is a subtle BC break, since the character `chr(128)` has the value `-1` if `char` is signed, and so likely would be confused with `EOF` when converted to `int`. We consider this BC break to be acceptable, since it's rather unlikely that anybody uses `chr(128)` as escape character, and it easily can be fixed by casting all `escape` arguments to `unsigned char`. This patch implements the feature requests 38301[2] and 51496[3]. [1] [2] [3] --- ext/spl/spl_directory.c | 59 +++++++++++++------ ext/spl/spl_directory.h | 2 +- .../SplFileObject_fgetcsv_escape_empty.phpt | 31 ++++++++++ .../SplFileObject_fgetcsv_escape_error.phpt | 2 +- .../SplFileObject_fputcsv_variation15.phpt | 22 +++++++ .../SplFileObject_setCsvControl_error003.phpt | 2 +- ...FileObject_setCsvControl_variation002.phpt | 19 ++++++ ext/standard/file.c | 49 ++++++++------- ext/standard/file.h | 6 +- ext/standard/string.c | 7 ++- .../tests/file/fgetcsv_variation32.phpt | 32 ++++++++++ .../tests/file/fputcsv_variation16.phpt | 21 +++++++ .../tests/strings/str_getcsv_002.phpt | 19 ++++++ 13 files changed, 224 insertions(+), 47 deletions(-) create mode 100644 ext/spl/tests/SplFileObject_fgetcsv_escape_empty.phpt create mode 100644 ext/spl/tests/SplFileObject_fputcsv_variation15.phpt create mode 100644 ext/spl/tests/SplFileObject_setCsvControl_variation002.phpt create mode 100644 ext/standard/tests/file/fgetcsv_variation32.phpt create mode 100644 ext/standard/tests/file/fputcsv_variation16.phpt create mode 100644 ext/standard/tests/strings/str_getcsv_002.phpt diff --git a/ext/spl/spl_directory.c b/ext/spl/spl_directory.c index 73b7be3618886..bd941b9190dfb 100644 --- a/ext/spl/spl_directory.c +++ b/ext/spl/spl_directory.c @@ -323,7 +323,7 @@ static int spl_filesystem_file_open(spl_filesystem_object *intern, int use_inclu intern->u.file.delimiter = ','; intern->u.file.enclosure = '"'; - intern->u.file.escape = '\\'; + intern->u.file.escape = (unsigned char) '\\'; intern->u.file.func_getCurr = zend_hash_str_find_ptr(&intern->std.ce->function_table, "getcurrentline", sizeof("getcurrentline") - 1); @@ -2109,7 +2109,7 @@ static int spl_filesystem_file_call(spl_filesystem_object *intern, zend_function spl_filesystem_file_call(intern, func_ptr, pass_num_args, return_value, arg2); \ } /* }}} */ -static int spl_filesystem_file_read_csv(spl_filesystem_object *intern, char delimiter, char enclosure, char escape, zval *return_value) /* {{{ */ +static int spl_filesystem_file_read_csv(spl_filesystem_object *intern, char delimiter, char enclosure, int escape, zval *return_value) /* {{{ */ { int ret = SUCCESS; zval *value; @@ -2562,7 +2562,8 @@ SPL_METHOD(SplFileObject, func_name) \ SPL_METHOD(SplFileObject, fgetcsv) { spl_filesystem_object *intern = Z_SPLFILESYSTEM_P(ZEND_THIS); - char delimiter = intern->u.file.delimiter, enclosure = intern->u.file.enclosure, escape = intern->u.file.escape; + char delimiter = intern->u.file.delimiter, enclosure = intern->u.file.enclosure; + int escape = intern->u.file.escape; char *delim = NULL, *enclo = NULL, *esc = NULL; size_t d_len = 0, e_len = 0, esc_len = 0; @@ -2576,11 +2577,15 @@ SPL_METHOD(SplFileObject, fgetcsv) switch(ZEND_NUM_ARGS()) { case 3: - if (esc_len != 1) { - php_error_docref(NULL, E_WARNING, "escape must be a character"); + if (esc_len > 1) { + php_error_docref(NULL, E_WARNING, "escape must be empty or a single character"); RETURN_FALSE; } - escape = esc[0]; + if (esc_len == 0) { + escape = PHP_CSV_NO_ESCAPE; + } else { + escape = (unsigned char) esc[0]; + } /* no break */ case 2: if (e_len != 1) { @@ -2609,7 +2614,8 @@ SPL_METHOD(SplFileObject, fgetcsv) SPL_METHOD(SplFileObject, fputcsv) { spl_filesystem_object *intern = Z_SPLFILESYSTEM_P(ZEND_THIS); - char delimiter = intern->u.file.delimiter, enclosure = intern->u.file.enclosure, escape = intern->u.file.escape; + char delimiter = intern->u.file.delimiter, enclosure = intern->u.file.enclosure; + int escape = intern->u.file.escape; char *delim = NULL, *enclo = NULL, *esc = NULL; size_t d_len = 0, e_len = 0, esc_len = 0; zend_long ret; @@ -2619,11 +2625,17 @@ SPL_METHOD(SplFileObject, fputcsv) switch(ZEND_NUM_ARGS()) { case 4: - if (esc_len != 1) { - php_error_docref(NULL, E_WARNING, "escape must be a character"); - RETURN_FALSE; + switch (esc_len) { + case 0: + escape = PHP_CSV_NO_ESCAPE; + break; + case 1: + escape = (unsigned char) esc[0]; + break; + default: + php_error_docref(NULL, E_WARNING, "escape must be empty or a single character"); + RETURN_FALSE; } - escape = esc[0]; /* no break */ case 3: if (e_len != 1) { @@ -2654,7 +2666,8 @@ SPL_METHOD(SplFileObject, fputcsv) SPL_METHOD(SplFileObject, setCsvControl) { spl_filesystem_object *intern = Z_SPLFILESYSTEM_P(ZEND_THIS); - char delimiter = ',', enclosure = '"', escape='\\'; + char delimiter = ',', enclosure = '"'; + int escape = (unsigned char) '\\'; char *delim = NULL, *enclo = NULL, *esc = NULL; size_t d_len = 0, e_len = 0, esc_len = 0; @@ -2662,11 +2675,17 @@ SPL_METHOD(SplFileObject, setCsvControl) switch(ZEND_NUM_ARGS()) { case 3: - if (esc_len != 1) { - php_error_docref(NULL, E_WARNING, "escape must be a character"); - RETURN_FALSE; + switch (esc_len) { + case 0: + escape = PHP_CSV_NO_ESCAPE; + break; + case 1: + escape = (unsigned char) esc[0]; + break; + default: + php_error_docref(NULL, E_WARNING, "escape must be empty or a single character"); + RETURN_FALSE; } - escape = esc[0]; /* no break */ case 2: if (e_len != 1) { @@ -2705,8 +2724,12 @@ SPL_METHOD(SplFileObject, getCsvControl) delimiter[1] = '\0'; enclosure[0] = intern->u.file.enclosure; enclosure[1] = '\0'; - escape[0] = intern->u.file.escape; - escape[1] = '\0'; + if (intern->u.file.escape == PHP_CSV_NO_ESCAPE) { + escape[0] = '\0'; + } else { + escape[0] = (unsigned char) intern->u.file.escape; + escape[1] = '\0'; + } add_next_index_string(return_value, delimiter); add_next_index_string(return_value, enclosure); diff --git a/ext/spl/spl_directory.h b/ext/spl/spl_directory.h index 2bbdadfe6d4f1..d4dcfcbc20899 100644 --- a/ext/spl/spl_directory.h +++ b/ext/spl/spl_directory.h @@ -96,7 +96,7 @@ struct _spl_filesystem_object { zend_function *func_getCurr; char delimiter; char enclosure; - char escape; + int escape; } file; } u; zend_object std; diff --git a/ext/spl/tests/SplFileObject_fgetcsv_escape_empty.phpt b/ext/spl/tests/SplFileObject_fgetcsv_escape_empty.phpt new file mode 100644 index 0000000000000..cbc539c3234a6 --- /dev/null +++ b/ext/spl/tests/SplFileObject_fgetcsv_escape_empty.phpt @@ -0,0 +1,31 @@ +--TEST-- +SplFileObject::fgetcsv() with empty $escape +--FILE-- +fwrite($contents); +$file->rewind(); +while (($data = $file->fgetcsv(',', '"', ''))) { + print_r($data); +} +?> +===DONE=== +--EXPECT-- +Array +( + [0] => cell1 + [1] => cell2\ + [2] => cell3 + [3] => cell4 +) +Array +( + [0] => \\\line1 +line2\\\ +) +===DONE=== diff --git a/ext/spl/tests/SplFileObject_fgetcsv_escape_error.phpt b/ext/spl/tests/SplFileObject_fgetcsv_escape_error.phpt index b49bcdd13c345..4873341e5a2b9 100644 --- a/ext/spl/tests/SplFileObject_fgetcsv_escape_error.phpt +++ b/ext/spl/tests/SplFileObject_fgetcsv_escape_error.phpt @@ -14,5 +14,5 @@ var_dump($fo->fgetcsv(',', '"', 'invalid')); unlink('SplFileObject__fgetcsv8.csv'); ?> --EXPECTF-- -Warning: SplFileObject::fgetcsv(): escape must be a character in %s on line %d +Warning: SplFileObject::fgetcsv(): escape must be empty or a single character in %s on line %d bool(false) diff --git a/ext/spl/tests/SplFileObject_fputcsv_variation15.phpt b/ext/spl/tests/SplFileObject_fputcsv_variation15.phpt new file mode 100644 index 0000000000000..1cde292e28dbd --- /dev/null +++ b/ext/spl/tests/SplFileObject_fputcsv_variation15.phpt @@ -0,0 +1,22 @@ +--TEST-- +SplFileObject::fputcsv() with empty $escape +--FILE-- +fputcsv($record, ',', '"', ''); +} +$file->rewind(); +foreach ($file as $line) { + echo $line; +} +?> +===DONE=== +--EXPECT-- +\ +"\""" +===DONE=== diff --git a/ext/spl/tests/SplFileObject_setCsvControl_error003.phpt b/ext/spl/tests/SplFileObject_setCsvControl_error003.phpt index 927172a8dc818..071c54c1f0cb8 100644 --- a/ext/spl/tests/SplFileObject_setCsvControl_error003.phpt +++ b/ext/spl/tests/SplFileObject_setCsvControl_error003.phpt @@ -22,4 +22,4 @@ $s->setCsvControl('|', '\'', 'three'); unlink('csv_control_data.csv'); ?> --EXPECTF-- -Warning: SplFileObject::setCsvControl(): escape must be a character in %s on line %d +Warning: SplFileObject::setCsvControl(): escape must be empty or a single character in %s on line %d diff --git a/ext/spl/tests/SplFileObject_setCsvControl_variation002.phpt b/ext/spl/tests/SplFileObject_setCsvControl_variation002.phpt new file mode 100644 index 0000000000000..6d3a76ce945d9 --- /dev/null +++ b/ext/spl/tests/SplFileObject_setCsvControl_variation002.phpt @@ -0,0 +1,19 @@ +--TEST-- +SplFileObject::setCsvControl() and ::getCsvControl() with empty $escape +--FILE-- +setCsvControl(',', '"', ''); +var_dump($file->getCsvControl()); +?> +===DONE=== +--EXPECT-- +array(3) { + [0]=> + string(1) "," + [1]=> + string(1) """ + [2]=> + string(0) "" +} +===DONE=== diff --git a/ext/standard/file.c b/ext/standard/file.c index 2eed1562b9323..c962ba4824c6c 100644 --- a/ext/standard/file.c +++ b/ext/standard/file.c @@ -1857,9 +1857,9 @@ static const char *php_fgetcsv_lookup_trailing_spaces(const char *ptr, size_t le Format line as CSV and write to file pointer */ PHP_FUNCTION(fputcsv) { - char delimiter = ','; /* allow this to be set as parameter */ - char enclosure = '"'; /* allow this to be set as parameter */ - char escape_char = '\\'; /* allow this to be set as parameter */ + char delimiter = ','; /* allow this to be set as parameter */ + char enclosure = '"'; /* allow this to be set as parameter */ + int escape_char = (unsigned char) '\\'; /* allow this to be set as parameter */ php_stream *stream; zval *fp = NULL, *fields = NULL; size_t ret; @@ -1900,14 +1900,15 @@ PHP_FUNCTION(fputcsv) } if (escape_str != NULL) { + if (escape_str_len > 1) { + php_error_docref(NULL, E_NOTICE, "escape must be empty or a single character"); + } if (escape_str_len < 1) { - php_error_docref(NULL, E_WARNING, "escape must be a character"); - RETURN_FALSE; - } else if (escape_str_len > 1) { - php_error_docref(NULL, E_NOTICE, "escape must be a single character"); + escape_char = PHP_CSV_NO_ESCAPE; + } else { + /* use first character from string */ + escape_char = (unsigned char) *escape_str; } - /* use first character from string */ - escape_char = *escape_str; } PHP_STREAM_TO_ZVAL(stream, fp); @@ -1917,14 +1918,15 @@ PHP_FUNCTION(fputcsv) } /* }}} */ -/* {{{ PHPAPI size_t php_fputcsv(php_stream *stream, zval *fields, char delimiter, char enclosure, char escape_char) */ -PHPAPI size_t php_fputcsv(php_stream *stream, zval *fields, char delimiter, char enclosure, char escape_char) +/* {{{ PHPAPI size_t php_fputcsv(php_stream *stream, zval *fields, char delimiter, char enclosure, int escape_char) */ +PHPAPI size_t php_fputcsv(php_stream *stream, zval *fields, char delimiter, char enclosure, int escape_char) { int count, i = 0; size_t ret; zval *field_tmp; smart_str csvline = {0}; + ZEND_ASSERT((escape_char >= 0 && escape_char <= UCHAR_MAX) || escape_char == PHP_CSV_NO_ESCAPE); count = zend_hash_num_elements(Z_ARRVAL_P(fields)); ZEND_HASH_FOREACH_VAL(Z_ARRVAL_P(fields), field_tmp) { zend_string *tmp_field_str; @@ -1933,7 +1935,7 @@ PHPAPI size_t php_fputcsv(php_stream *stream, zval *fields, char delimiter, char /* enclose a field that contains a delimiter, an enclosure character, or a newline */ if (FPUTCSV_FLD_CHK(delimiter) || FPUTCSV_FLD_CHK(enclosure) || - FPUTCSV_FLD_CHK(escape_char) || + (escape_char != PHP_CSV_NO_ESCAPE && FPUTCSV_FLD_CHK(escape_char)) || FPUTCSV_FLD_CHK('\n') || FPUTCSV_FLD_CHK('\r') || FPUTCSV_FLD_CHK('\t') || @@ -1945,7 +1947,7 @@ PHPAPI size_t php_fputcsv(php_stream *stream, zval *fields, char delimiter, char smart_str_appendc(&csvline, enclosure); while (ch < end) { - if (*ch == escape_char) { + if (escape_char != PHP_CSV_NO_ESCAPE && *ch == escape_char) { escaped = 1; } else if (!escaped && *ch == enclosure) { smart_str_appendc(&csvline, enclosure); @@ -1983,7 +1985,7 @@ PHP_FUNCTION(fgetcsv) { char delimiter = ','; /* allow this to be set as parameter */ char enclosure = '"'; /* allow this to be set as parameter */ - char escape = '\\'; + int escape = (unsigned char) '\\'; /* first section exactly as php_fgetss */ @@ -2036,14 +2038,15 @@ PHP_FUNCTION(fgetcsv) } if (escape_str != NULL) { - if (escape_str_len < 1) { - php_error_docref(NULL, E_WARNING, "escape must be character"); - RETURN_FALSE; - } else if (escape_str_len > 1) { - php_error_docref(NULL, E_NOTICE, "escape must be a single character"); + if (escape_str_len > 1) { + php_error_docref(NULL, E_NOTICE, "escape must be empty or a single character"); } - escape = escape_str[0]; + if (escape_str_len < 1) { + escape = PHP_CSV_NO_ESCAPE; + } else { + escape = (unsigned char) escape_str[0]; + } } if (len_zv != NULL && Z_TYPE_P(len_zv) != IS_NULL) { @@ -2077,13 +2080,15 @@ PHP_FUNCTION(fgetcsv) } /* }}} */ -PHPAPI void php_fgetcsv(php_stream *stream, char delimiter, char enclosure, char escape_char, size_t buf_len, char *buf, zval *return_value) /* {{{ */ +PHPAPI void php_fgetcsv(php_stream *stream, char delimiter, char enclosure, int escape_char, size_t buf_len, char *buf, zval *return_value) /* {{{ */ { char *temp, *tptr, *bptr, *line_end, *limit; size_t temp_len, line_end_len; int inc_len; zend_bool first_field = 1; + ZEND_ASSERT((escape_char >= 0 && escape_char <= UCHAR_MAX) || escape_char == PHP_CSV_NO_ESCAPE); + /* initialize internal state */ php_mb_reset(); @@ -2227,7 +2232,7 @@ PHPAPI void php_fgetcsv(php_stream *stream, char delimiter, char enclosure, char default: if (*bptr == enclosure) { state = 2; - } else if (*bptr == escape_char) { + } else if (escape_char != PHP_CSV_NO_ESCAPE && *bptr == escape_char) { state = 1; } bptr++; diff --git a/ext/standard/file.h b/ext/standard/file.h index b4e564f1c3891..f5019d422bac8 100644 --- a/ext/standard/file.h +++ b/ext/standard/file.h @@ -77,8 +77,10 @@ PHPAPI int php_copy_file_ex(const char *src, const char *dest, int src_chk); PHPAPI int php_copy_file_ctx(const char *src, const char *dest, int src_chk, php_stream_context *ctx); PHPAPI int php_mkdir_ex(const char *dir, zend_long mode, int options); PHPAPI int php_mkdir(const char *dir, zend_long mode); -PHPAPI void php_fgetcsv(php_stream *stream, char delimiter, char enclosure, char escape_char, size_t buf_len, char *buf, zval *return_value); -PHPAPI size_t php_fputcsv(php_stream *stream, zval *fields, char delimiter, char enclosure, char escape_char); + +#define PHP_CSV_NO_ESCAPE EOF +PHPAPI void php_fgetcsv(php_stream *stream, char delimiter, char enclosure, int escape_char, size_t buf_len, char *buf, zval *return_value); +PHPAPI size_t php_fputcsv(php_stream *stream, zval *fields, char delimiter, char enclosure, int escape_char); #define META_DEF_BUFSIZE 8192 diff --git a/ext/standard/string.c b/ext/standard/string.c index 767bd2f519ea6..26144eb166de3 100644 --- a/ext/standard/string.c +++ b/ext/standard/string.c @@ -5365,7 +5365,8 @@ Parse a CSV string into an array */ PHP_FUNCTION(str_getcsv) { zend_string *str; - char delim = ',', enc = '"', esc = '\\'; + char delim = ',', enc = '"'; + int esc = (unsigned char) '\\'; char *delim_str = NULL, *enc_str = NULL, *esc_str = NULL; size_t delim_len = 0, enc_len = 0, esc_len = 0; @@ -5379,7 +5380,9 @@ PHP_FUNCTION(str_getcsv) delim = delim_len ? delim_str[0] : delim; enc = enc_len ? enc_str[0] : enc; - esc = esc_len ? esc_str[0] : esc; + if (esc_str != NULL) { + esc = esc_len ? (unsigned char) esc_str[0] : PHP_CSV_NO_ESCAPE; + } php_fgetcsv(NULL, delim, enc, esc, ZSTR_LEN(str), ZSTR_VAL(str), return_value); } diff --git a/ext/standard/tests/file/fgetcsv_variation32.phpt b/ext/standard/tests/file/fgetcsv_variation32.phpt new file mode 100644 index 0000000000000..eac1046c448e4 --- /dev/null +++ b/ext/standard/tests/file/fgetcsv_variation32.phpt @@ -0,0 +1,32 @@ +--TEST-- +fgetcsv() with empty $escape +--FILE-- + +===DONE=== +--EXPECT-- +Array +( + [0] => cell1 + [1] => cell2\ + [2] => cell3 + [3] => cell4 +) +Array +( + [0] => \\\line1 +line2\\\ +) +===DONE=== diff --git a/ext/standard/tests/file/fputcsv_variation16.phpt b/ext/standard/tests/file/fputcsv_variation16.phpt new file mode 100644 index 0000000000000..1bae60da20da4 --- /dev/null +++ b/ext/standard/tests/file/fputcsv_variation16.phpt @@ -0,0 +1,21 @@ +--TEST-- +fputcsv() with empty $escape +--FILE-- + +===DONE=== +--EXPECT-- +\ +"\""" +===DONE=== diff --git a/ext/standard/tests/strings/str_getcsv_002.phpt b/ext/standard/tests/strings/str_getcsv_002.phpt new file mode 100644 index 0000000000000..a4ff87c206583 --- /dev/null +++ b/ext/standard/tests/strings/str_getcsv_002.phpt @@ -0,0 +1,19 @@ +--TEST-- +str_getcsv() with empty $escape +--FILE-- + +===DONE=== +--EXPECT-- +Array +( + [0] => cell1 + [1] => cell2\ + [2] => cell3 + [3] => cell4 +) +===DONE===