From c3098ddd6205a7ec8183d33f5420fa0129c25d3d Mon Sep 17 00:00:00 2001 From: Jakub Zelenka Date: Sat, 17 Jun 2017 11:12:03 +0100 Subject: [PATCH 1/8] Add options for ignore or substituting invalid UTF8 in json_encode --- ext/json/json.c | 4 +++ ext/json/json_encoder.c | 22 +++++++++----- ext/json/php_json.h | 32 +++++++++++--------- ext/json/tests/json_encode_invalid_utf8.phpt | 23 ++++++++++++++ 4 files changed, 60 insertions(+), 21 deletions(-) create mode 100644 ext/json/tests/json_encode_invalid_utf8.phpt diff --git a/ext/json/json.c b/ext/json/json.c index 9c930ee1bc95c..e645f7c52bd8f 100644 --- a/ext/json/json.c +++ b/ext/json/json.c @@ -115,6 +115,10 @@ static PHP_MINIT_FUNCTION(json) PHP_JSON_REGISTER_CONSTANT("JSON_OBJECT_AS_ARRAY", PHP_JSON_OBJECT_AS_ARRAY); PHP_JSON_REGISTER_CONSTANT("JSON_BIGINT_AS_STRING", PHP_JSON_BIGINT_AS_STRING); + /* common options for json_decode and json_encode */ + PHP_JSON_REGISTER_CONSTANT("JSON_INVALID_UTF8_IGNORE", PHP_JSON_INVALID_UTF8_IGNORE); + PHP_JSON_REGISTER_CONSTANT("JSON_INVALID_UTF8_SUBSTITUTE", PHP_JSON_INVALID_UTF8_SUBSTITUTE); + /* json error constants */ PHP_JSON_REGISTER_CONSTANT("JSON_ERROR_NONE", PHP_JSON_ERROR_NONE); PHP_JSON_REGISTER_CONSTANT("JSON_ERROR_DEPTH", PHP_JSON_ERROR_DEPTH); diff --git a/ext/json/json_encoder.c b/ext/json/json_encoder.c index 9d480bcc90389..8e1e9566e174a 100644 --- a/ext/json/json_encoder.c +++ b/ext/json/json_encoder.c @@ -291,14 +291,22 @@ static int php_json_escape_string( /* check whether UTF8 character is correct */ if (status != SUCCESS) { - if (buf->s) { - ZSTR_LEN(buf->s) = checkpoint; - } - encoder->error_code = PHP_JSON_ERROR_UTF8; - if (options & PHP_JSON_PARTIAL_OUTPUT_ON_ERROR) { - smart_str_appendl(buf, "null", 4); + if (options & PHP_JSON_INVALID_UTF8_IGNORE) { + /* ignore invalid UTF8 character */ + continue; + } else if (options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) { + /* Use Unicode character 'REPLACEMENT CHARACTER' (U+FFFD) */ + us = 0xfffd; + } else { + if 
(buf->s) { + ZSTR_LEN(buf->s) = checkpoint; + } + encoder->error_code = PHP_JSON_ERROR_UTF8; + if (options & PHP_JSON_PARTIAL_OUTPUT_ON_ERROR) { + smart_str_appendl(buf, "null", 4); + } + return FAILURE; } - return FAILURE; } /* Escape U+2028/U+2029 line terminators, UNLESS both diff --git a/ext/json/php_json.h b/ext/json/php_json.h index 992f42c08742d..e772d8d0c2111 100644 --- a/ext/json/php_json.h +++ b/ext/json/php_json.h @@ -55,23 +55,27 @@ typedef enum { PHP_JSON_ERROR_UTF16 } php_json_error_code; +/* json_decode() options */ +#define PHP_JSON_OBJECT_AS_ARRAY (1<<0) +#define PHP_JSON_BIGINT_AS_STRING (1<<1) + /* json_encode() options */ -#define PHP_JSON_HEX_TAG (1<<0) -#define PHP_JSON_HEX_AMP (1<<1) -#define PHP_JSON_HEX_APOS (1<<2) -#define PHP_JSON_HEX_QUOT (1<<3) -#define PHP_JSON_FORCE_OBJECT (1<<4) -#define PHP_JSON_NUMERIC_CHECK (1<<5) -#define PHP_JSON_UNESCAPED_SLASHES (1<<6) -#define PHP_JSON_PRETTY_PRINT (1<<7) -#define PHP_JSON_UNESCAPED_UNICODE (1<<8) -#define PHP_JSON_PARTIAL_OUTPUT_ON_ERROR (1<<9) -#define PHP_JSON_PRESERVE_ZERO_FRACTION (1<<10) +#define PHP_JSON_HEX_TAG (1<<0) +#define PHP_JSON_HEX_AMP (1<<1) +#define PHP_JSON_HEX_APOS (1<<2) +#define PHP_JSON_HEX_QUOT (1<<3) +#define PHP_JSON_FORCE_OBJECT (1<<4) +#define PHP_JSON_NUMERIC_CHECK (1<<5) +#define PHP_JSON_UNESCAPED_SLASHES (1<<6) +#define PHP_JSON_PRETTY_PRINT (1<<7) +#define PHP_JSON_UNESCAPED_UNICODE (1<<8) +#define PHP_JSON_PARTIAL_OUTPUT_ON_ERROR (1<<9) +#define PHP_JSON_PRESERVE_ZERO_FRACTION (1<<10) #define PHP_JSON_UNESCAPED_LINE_TERMINATORS (1<<11) -/* json_decode() options */ -#define PHP_JSON_OBJECT_AS_ARRAY (1<<0) -#define PHP_JSON_BIGINT_AS_STRING (1<<1) +/* json_decode() and json_encode() common options */ +#define PHP_JSON_INVALID_UTF8_IGNORE (1<<20) +#define PHP_JSON_INVALID_UTF8_SUBSTITUTE (1<<21) /* Internal flags */ #define PHP_JSON_OUTPUT_ARRAY 0 diff --git a/ext/json/tests/json_encode_invalid_utf8.phpt b/ext/json/tests/json_encode_invalid_utf8.phpt new file 
mode 100644 index 0000000000000..888b6ad7e8754 --- /dev/null +++ b/ext/json/tests/json_encode_invalid_utf8.phpt @@ -0,0 +1,23 @@ +--TEST-- +json_encode() invalide UTF8 +--SKIPIF-- + +--FILE-- + +--EXPECTF-- +bool(false) +string(8) ""foobar"" +string(14) ""foo\ufffdbar"" +bool(false) +string(9) "%s" +Done From 032fd0c4d08415f745e69c05f95d8394f540696b Mon Sep 17 00:00:00 2001 From: Jakub Zelenka Date: Sun, 25 Jun 2017 17:20:03 +0100 Subject: [PATCH 2/8] Prepare invlid UTF8 substitution in JSON scanner --- ext/json/json_scanner.c | 8 ++++++++ ext/json/json_scanner.re | 8 ++++++++ ext/json/php_json_scanner.h | 6 ++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/ext/json/json_scanner.c b/ext/json/json_scanner.c index 7104c3ea61f1f..60dde3563e097 100644 --- a/ext/json/json_scanner.c +++ b/ext/json/json_scanner.c @@ -623,6 +623,14 @@ int php_json_scan(php_json_scanner *s) ++YYCURSOR; yy80: { + if (s->options & PHP_JSON_INVALID_UTF8_IGNORE) { + PHP_JSON_CONDITION_GOTO(STR_P1); + } + if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) { + s->utf8_sub_needed = 1; + s->utf8_sub_len += 2 - (s->cursor - s->token); + PHP_JSON_CONDITION_GOTO(STR_P1); + } s->errcode = PHP_JSON_ERROR_UTF8; return PHP_JSON_T_ERROR; } diff --git a/ext/json/json_scanner.re b/ext/json/json_scanner.re index be0000b8b945b..580ef745c8483 100644 --- a/ext/json/json_scanner.re +++ b/ext/json/json_scanner.re @@ -269,6 +269,14 @@ std: } UTF8 { PHP_JSON_CONDITION_GOTO(STR_P1); } ANY { + if (s->options & PHP_JSON_INVALID_UTF8_IGNORE) { + PHP_JSON_CONDITION_GOTO(STR_P1); + } + if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) { + s->utf8_sub_needed = 1; + s->utf8_sub_len += 2 - (s->cursor - s->token); + PHP_JSON_CONDITION_GOTO(STR_P1); + } s->errcode = PHP_JSON_ERROR_UTF8; return PHP_JSON_T_ERROR; } diff --git a/ext/json/php_json_scanner.h b/ext/json/php_json_scanner.h index f13483d2b796c..6f962b4406d6c 100644 --- a/ext/json/php_json_scanner.h +++ b/ext/json/php_json_scanner.h @@ -34,9 +34,11 
@@ typedef struct _php_json_scanner { php_json_ctype *pstr; /* string pointer for escapes conversion */ zval value; /* value */ int str_esc; /* number of extra characters for escaping */ - int state; /* condition state */ - int options; /* options */ + int state; /* condition state */ + int options; /* options */ php_json_error_code errcode; /* error type if there is an error */ + int utf8_sub_needed; /* whether utf8 substitution is needed */ + int utf8_sub_len; /* how many extra character is needed (can be negative) */ } php_json_scanner; From 4d4a1ec8de9fcf18b347df8e21962a65279b09a7 Mon Sep 17 00:00:00 2001 From: Jakub Zelenka Date: Wed, 28 Jun 2017 20:39:30 +0100 Subject: [PATCH 3/8] Split STR_P2 condition in JSON scanner --- ext/json/json_scanner.c | 691 ++++++++++++++++++++++++------- ext/json/json_scanner.re | 50 ++- ext/json/php_json_scanner_defs.h | 3 +- 3 files changed, 578 insertions(+), 166 deletions(-) diff --git a/ext/json/json_scanner.c b/ext/json/json_scanner.c index 60dde3563e097..fd824d4dd8b5c 100644 --- a/ext/json/json_scanner.c +++ b/ext/json/json_scanner.c @@ -36,6 +36,18 @@ #define PHP_JSON_CONDITION_SET(condition) YYSETCONDITION(yyc##condition) #define PHP_JSON_CONDITION_GOTO(condition) goto yyc_##condition +#define PHP_JSON_CONDITION_SET_AND_GOTO(condition) \ + PHP_JSON_CONDITION_SET(condition); \ + PHP_JSON_CONDITION_GOTO(condition) +#define PHP_JSON_CONDITION_GOTO_STR_P2() \ + do { \ + if (s->utf8_sub_needed) { \ + PHP_JSON_CONDITION_GOTO(STR_P2_BIN); \ + } else { \ + PHP_JSON_CONDITION_GOTO(STR_P2_UTF); \ + } \ + } while(0) + #define PHP_JSON_SCANNER_COPY_ESC() php_json_scanner_copy_string(s, 0) #define PHP_JSON_SCANNER_COPY_UTF() php_json_scanner_copy_string(s, 5) @@ -101,13 +113,17 @@ int php_json_scan(php_json_scanner *s) { YYCTYPE yych; unsigned int yyaccept = 0; - if (YYGETCONDITION() < 1) { - goto yyc_JS; - } else { - if (YYGETCONDITION() < 2) { + if (YYGETCONDITION() < 2) { + if (YYGETCONDITION() < 1) { + goto yyc_JS; + } else { 
goto yyc_STR_P1; + } + } else { + if (YYGETCONDITION() < 3) { + goto yyc_STR_P2_BIN; } else { - goto yyc_STR_P2; + goto yyc_STR_P2_UTF; } } /* *********************************** */ @@ -276,8 +292,7 @@ int php_json_scan(php_json_scanner *s) { s->str_start = s->cursor; s->str_esc = 0; - PHP_JSON_CONDITION_SET(STR_P1); - PHP_JSON_CONDITION_GOTO(STR_P1); + PHP_JSON_CONDITION_SET_AND_GOTO(STR_P1); } yy16: ++YYCURSOR; @@ -579,8 +594,11 @@ int php_json_scan(php_json_scanner *s) if (s->str_esc) { s->pstr = (php_json_ctype *) Z_STRVAL(s->value); s->cursor = s->str_start; - PHP_JSON_CONDITION_SET(STR_P2); - PHP_JSON_CONDITION_GOTO(STR_P2); + if (s->utf8_sub_needed) { + PHP_JSON_CONDITION_SET_AND_GOTO(STR_P2_BIN); + } else { + PHP_JSON_CONDITION_SET_AND_GOTO(STR_P2_UTF); + } } else { memcpy(Z_STRVAL(s->value), s->str_start, len); PHP_JSON_CONDITION_SET(JS); @@ -963,12 +981,32 @@ int php_json_scan(php_json_scanner *s) PHP_JSON_CONDITION_GOTO(STR_P1); } /* *********************************** */ -yyc_STR_P2: +yyc_STR_P2_BIN: yych = *YYCURSOR; - if (yych == '"') goto yy127; - if (yych == '\\') goto yy129; + if (yych <= 0xDF) { + if (yych <= '[') { + if (yych == '"') goto yy127; + } else { + if (yych <= '\\') goto yy129; + if (yych <= 0x7F) goto yy125; + if (yych <= 0xC1) goto yy131; + goto yy133; + } + } else { + if (yych <= 0xEF) { + if (yych <= 0xE0) goto yy134; + if (yych == 0xED) goto yy136; + goto yy135; + } else { + if (yych <= 0xF0) goto yy137; + if (yych <= 0xF3) goto yy138; + if (yych <= 0xF4) goto yy139; + goto yy131; + } + } +yy125: ++YYCURSOR; - { PHP_JSON_CONDITION_GOTO(STR_P2); } + { PHP_JSON_CONDITION_GOTO(STR_P2_BIN); } yy127: ++YYCURSOR; YYSETCONDITION(yycJS); @@ -977,8 +1015,9 @@ int php_json_scan(php_json_scanner *s) return PHP_JSON_T_STRING; } yy129: + yyaccept = 0; yych = *(YYMARKER = ++YYCURSOR); - if (yych == 'u') goto yy131; + if (yych == 'u') goto yy140; yy130: { char esc; @@ -1011,181 +1050,242 @@ int php_json_scan(php_json_scanner *s) *(s->pstr++) = 
esc; ++YYCURSOR; s->str_start = s->cursor; - PHP_JSON_CONDITION_GOTO(STR_P2); + PHP_JSON_CONDITION_GOTO_STR_P2(); } yy131: + ++YYCURSOR; +yy132: + { + PHP_JSON_CONDITION_GOTO(STR_P2_BIN); + } +yy133: + yych = *++YYCURSOR; + if (yych <= 0x7F) goto yy132; + if (yych <= 0xBF) goto yy125; + goto yy132; +yy134: + yyaccept = 1; + yych = *(YYMARKER = ++YYCURSOR); + if (yych <= 0x9F) goto yy132; + if (yych <= 0xBF) goto yy142; + goto yy132; +yy135: + yyaccept = 1; + yych = *(YYMARKER = ++YYCURSOR); + if (yych <= 0x7F) goto yy132; + if (yych <= 0xBF) goto yy142; + goto yy132; +yy136: + yyaccept = 1; + yych = *(YYMARKER = ++YYCURSOR); + if (yych <= 0x7F) goto yy132; + if (yych <= 0x9F) goto yy142; + goto yy132; +yy137: + yyaccept = 1; + yych = *(YYMARKER = ++YYCURSOR); + if (yych <= 0x8F) goto yy132; + if (yych <= 0xBF) goto yy143; + goto yy132; +yy138: + yyaccept = 1; + yych = *(YYMARKER = ++YYCURSOR); + if (yych <= 0x7F) goto yy132; + if (yych <= 0xBF) goto yy143; + goto yy132; +yy139: + yyaccept = 1; + yych = *(YYMARKER = ++YYCURSOR); + if (yych <= 0x7F) goto yy132; + if (yych <= 0x8F) goto yy143; + goto yy132; +yy140: yych = *++YYCURSOR; if (yych <= 'D') { if (yych <= '9') { - if (yych <= '/') goto yy132; - if (yych <= '0') goto yy133; - goto yy134; + if (yych <= '/') goto yy141; + if (yych <= '0') goto yy144; + goto yy145; } else { - if (yych <= '@') goto yy132; - if (yych <= 'C') goto yy134; - goto yy135; + if (yych <= '@') goto yy141; + if (yych <= 'C') goto yy145; + goto yy146; } } else { if (yych <= 'c') { - if (yych <= 'F') goto yy134; - if (yych >= 'a') goto yy134; + if (yych <= 'F') goto yy145; + if (yych >= 'a') goto yy145; } else { - if (yych <= 'd') goto yy135; - if (yych <= 'f') goto yy134; + if (yych <= 'd') goto yy146; + if (yych <= 'f') goto yy145; } } -yy132: +yy141: YYCURSOR = YYMARKER; - goto yy130; -yy133: + if (yyaccept == 0) { + goto yy130; + } else { + goto yy132; + } +yy142: + yych = *++YYCURSOR; + if (yych <= 0x7F) goto yy141; + if (yych <= 0xBF) 
goto yy125; + goto yy141; +yy143: + yych = *++YYCURSOR; + if (yych <= 0x7F) goto yy141; + if (yych <= 0xBF) goto yy142; + goto yy141; +yy144: yych = *++YYCURSOR; if (yych <= '9') { - if (yych <= '/') goto yy132; - if (yych <= '0') goto yy136; - if (yych <= '7') goto yy137; - goto yy138; + if (yych <= '/') goto yy141; + if (yych <= '0') goto yy147; + if (yych <= '7') goto yy148; + goto yy149; } else { if (yych <= 'F') { - if (yych <= '@') goto yy132; - goto yy138; + if (yych <= '@') goto yy141; + goto yy149; } else { - if (yych <= '`') goto yy132; - if (yych <= 'f') goto yy138; - goto yy132; + if (yych <= '`') goto yy141; + if (yych <= 'f') goto yy149; + goto yy141; } } -yy134: +yy145: yych = *++YYCURSOR; if (yych <= '@') { - if (yych <= '/') goto yy132; - if (yych <= '9') goto yy138; - goto yy132; + if (yych <= '/') goto yy141; + if (yych <= '9') goto yy149; + goto yy141; } else { - if (yych <= 'F') goto yy138; - if (yych <= '`') goto yy132; - if (yych <= 'f') goto yy138; - goto yy132; + if (yych <= 'F') goto yy149; + if (yych <= '`') goto yy141; + if (yych <= 'f') goto yy149; + goto yy141; } -yy135: +yy146: yych = *++YYCURSOR; if (yych <= '@') { - if (yych <= '/') goto yy132; - if (yych <= '7') goto yy138; - if (yych <= '9') goto yy139; - goto yy132; + if (yych <= '/') goto yy141; + if (yych <= '7') goto yy149; + if (yych <= '9') goto yy150; + goto yy141; } else { - if (yych <= 'B') goto yy139; - if (yych <= '`') goto yy132; - if (yych <= 'b') goto yy139; - goto yy132; + if (yych <= 'B') goto yy150; + if (yych <= '`') goto yy141; + if (yych <= 'b') goto yy150; + goto yy141; } -yy136: +yy147: yych = *++YYCURSOR; if (yych <= '@') { - if (yych <= '/') goto yy132; - if (yych <= '7') goto yy140; - if (yych <= '9') goto yy141; - goto yy132; + if (yych <= '/') goto yy141; + if (yych <= '7') goto yy151; + if (yych <= '9') goto yy152; + goto yy141; } else { - if (yych <= 'F') goto yy141; - if (yych <= '`') goto yy132; - if (yych <= 'f') goto yy141; - goto yy132; + if (yych 
<= 'F') goto yy152; + if (yych <= '`') goto yy141; + if (yych <= 'f') goto yy152; + goto yy141; } -yy137: +yy148: yych = *++YYCURSOR; if (yych <= '@') { - if (yych <= '/') goto yy132; - if (yych <= '9') goto yy141; - goto yy132; + if (yych <= '/') goto yy141; + if (yych <= '9') goto yy152; + goto yy141; } else { - if (yych <= 'F') goto yy141; - if (yych <= '`') goto yy132; - if (yych <= 'f') goto yy141; - goto yy132; + if (yych <= 'F') goto yy152; + if (yych <= '`') goto yy141; + if (yych <= 'f') goto yy152; + goto yy141; } -yy138: +yy149: yych = *++YYCURSOR; if (yych <= '@') { - if (yych <= '/') goto yy132; - if (yych <= '9') goto yy142; - goto yy132; + if (yych <= '/') goto yy141; + if (yych <= '9') goto yy153; + goto yy141; } else { - if (yych <= 'F') goto yy142; - if (yych <= '`') goto yy132; - if (yych <= 'f') goto yy142; - goto yy132; + if (yych <= 'F') goto yy153; + if (yych <= '`') goto yy141; + if (yych <= 'f') goto yy153; + goto yy141; } -yy139: +yy150: yych = *++YYCURSOR; if (yych <= '@') { - if (yych <= '/') goto yy132; - if (yych <= '9') goto yy143; - goto yy132; + if (yych <= '/') goto yy141; + if (yych <= '9') goto yy154; + goto yy141; } else { - if (yych <= 'F') goto yy143; - if (yych <= '`') goto yy132; - if (yych <= 'f') goto yy143; - goto yy132; + if (yych <= 'F') goto yy154; + if (yych <= '`') goto yy141; + if (yych <= 'f') goto yy154; + goto yy141; } -yy140: +yy151: yych = *++YYCURSOR; if (yych <= '@') { - if (yych <= '/') goto yy132; - if (yych <= '9') goto yy144; - goto yy132; + if (yych <= '/') goto yy141; + if (yych <= '9') goto yy155; + goto yy141; } else { - if (yych <= 'F') goto yy144; - if (yych <= '`') goto yy132; - if (yych <= 'f') goto yy144; - goto yy132; + if (yych <= 'F') goto yy155; + if (yych <= '`') goto yy141; + if (yych <= 'f') goto yy155; + goto yy141; } -yy141: +yy152: yych = *++YYCURSOR; if (yych <= '@') { - if (yych <= '/') goto yy132; - if (yych <= '9') goto yy146; - goto yy132; + if (yych <= '/') goto yy141; + if (yych 
<= '9') goto yy157; + goto yy141; } else { - if (yych <= 'F') goto yy146; - if (yych <= '`') goto yy132; - if (yych <= 'f') goto yy146; - goto yy132; + if (yych <= 'F') goto yy157; + if (yych <= '`') goto yy141; + if (yych <= 'f') goto yy157; + goto yy141; } -yy142: +yy153: yych = *++YYCURSOR; if (yych <= '@') { - if (yych <= '/') goto yy132; - if (yych <= '9') goto yy148; - goto yy132; + if (yych <= '/') goto yy141; + if (yych <= '9') goto yy159; + goto yy141; } else { - if (yych <= 'F') goto yy148; - if (yych <= '`') goto yy132; - if (yych <= 'f') goto yy148; - goto yy132; + if (yych <= 'F') goto yy159; + if (yych <= '`') goto yy141; + if (yych <= 'f') goto yy159; + goto yy141; } -yy143: +yy154: yych = *++YYCURSOR; if (yych <= '@') { - if (yych <= '/') goto yy132; - if (yych <= '9') goto yy150; - goto yy132; + if (yych <= '/') goto yy141; + if (yych <= '9') goto yy161; + goto yy141; } else { - if (yych <= 'F') goto yy150; - if (yych <= '`') goto yy132; - if (yych <= 'f') goto yy150; - goto yy132; + if (yych <= 'F') goto yy161; + if (yych <= '`') goto yy141; + if (yych <= 'f') goto yy161; + goto yy141; } -yy144: +yy155: ++YYCURSOR; { int utf16 = php_json_ucs2_to_int(s, 2); PHP_JSON_SCANNER_COPY_UTF(); *(s->pstr++) = (char) utf16; s->str_start = s->cursor; - PHP_JSON_CONDITION_GOTO(STR_P2); + PHP_JSON_CONDITION_GOTO_STR_P2(); } -yy146: +yy157: ++YYCURSOR; { int utf16 = php_json_ucs2_to_int(s, 3); @@ -1193,9 +1293,9 @@ int php_json_scan(php_json_scanner *s) *(s->pstr++) = (char) (0xc0 | (utf16 >> 6)); *(s->pstr++) = (char) (0x80 | (utf16 & 0x3f)); s->str_start = s->cursor; - PHP_JSON_CONDITION_GOTO(STR_P2); + PHP_JSON_CONDITION_GOTO_STR_P2(); } -yy148: +yy159: ++YYCURSOR; { int utf16 = php_json_ucs2_to_int(s, 4); @@ -1204,43 +1304,336 @@ int php_json_scan(php_json_scanner *s) *(s->pstr++) = (char) (0x80 | ((utf16 >> 6) & 0x3f)); *(s->pstr++) = (char) (0x80 | (utf16 & 0x3f)); s->str_start = s->cursor; - PHP_JSON_CONDITION_GOTO(STR_P2); + 
PHP_JSON_CONDITION_GOTO_STR_P2(); } -yy150: +yy161: yych = *++YYCURSOR; - if (yych != '\\') goto yy132; + if (yych != '\\') goto yy141; yych = *++YYCURSOR; - if (yych != 'u') goto yy132; + if (yych != 'u') goto yy141; yych = *++YYCURSOR; - if (yych == 'D') goto yy153; - if (yych != 'd') goto yy132; -yy153: + if (yych == 'D') goto yy164; + if (yych != 'd') goto yy141; +yy164: yych = *++YYCURSOR; - if (yych <= 'B') goto yy132; - if (yych <= 'F') goto yy154; - if (yych <= 'b') goto yy132; - if (yych >= 'g') goto yy132; -yy154: + if (yych <= 'B') goto yy141; + if (yych <= 'F') goto yy165; + if (yych <= 'b') goto yy141; + if (yych >= 'g') goto yy141; +yy165: yych = *++YYCURSOR; if (yych <= '@') { - if (yych <= '/') goto yy132; - if (yych >= ':') goto yy132; + if (yych <= '/') goto yy141; + if (yych >= ':') goto yy141; } else { - if (yych <= 'F') goto yy155; - if (yych <= '`') goto yy132; - if (yych >= 'g') goto yy132; + if (yych <= 'F') goto yy166; + if (yych <= '`') goto yy141; + if (yych >= 'g') goto yy141; } -yy155: +yy166: + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '/') goto yy141; + if (yych >= ':') goto yy141; + } else { + if (yych <= 'F') goto yy167; + if (yych <= '`') goto yy141; + if (yych >= 'g') goto yy141; + } +yy167: + ++YYCURSOR; + { + int utf32, utf16_hi, utf16_lo; + utf16_hi = php_json_ucs2_to_int(s, 4); + utf16_lo = php_json_ucs2_to_int_ex(s, 4, 7); + utf32 = ((utf16_lo & 0x3FF) << 10) + (utf16_hi & 0x3FF) + 0x10000; + PHP_JSON_SCANNER_COPY_UTF_SP(); + *(s->pstr++) = (char) (0xf0 | (utf32 >> 18)); + *(s->pstr++) = (char) (0x80 | ((utf32 >> 12) & 0x3f)); + *(s->pstr++) = (char) (0x80 | ((utf32 >> 6) & 0x3f)); + *(s->pstr++) = (char) (0x80 | (utf32 & 0x3f)); + s->str_start = s->cursor; + PHP_JSON_CONDITION_GOTO_STR_P2(); + } +/* *********************************** */ +yyc_STR_P2_UTF: + yych = *YYCURSOR; + if (yych == '"') goto yy173; + if (yych == '\\') goto yy175; + ++YYCURSOR; + { PHP_JSON_CONDITION_GOTO(STR_P2_UTF); } +yy173: + 
++YYCURSOR; + YYSETCONDITION(yycJS); + { + PHP_JSON_SCANNER_COPY_ESC(); + return PHP_JSON_T_STRING; + } +yy175: + yych = *(YYMARKER = ++YYCURSOR); + if (yych == 'u') goto yy177; +yy176: + { + char esc; + PHP_JSON_SCANNER_COPY_ESC(); + switch (*s->cursor) { + case 'b': + esc = '\b'; + break; + case 'f': + esc = '\f'; + break; + case 'n': + esc = '\n'; + break; + case 'r': + esc = '\r'; + break; + case 't': + esc = '\t'; + break; + case '\\': + case '/': + case '"': + esc = *s->cursor; + break; + default: + s->errcode = PHP_JSON_ERROR_SYNTAX; + return PHP_JSON_T_ERROR; + } + *(s->pstr++) = esc; + ++YYCURSOR; + s->str_start = s->cursor; + PHP_JSON_CONDITION_GOTO_STR_P2(); + } +yy177: + yych = *++YYCURSOR; + if (yych <= 'D') { + if (yych <= '9') { + if (yych <= '/') goto yy178; + if (yych <= '0') goto yy179; + goto yy180; + } else { + if (yych <= '@') goto yy178; + if (yych <= 'C') goto yy180; + goto yy181; + } + } else { + if (yych <= 'c') { + if (yych <= 'F') goto yy180; + if (yych >= 'a') goto yy180; + } else { + if (yych <= 'd') goto yy181; + if (yych <= 'f') goto yy180; + } + } +yy178: + YYCURSOR = YYMARKER; + goto yy176; +yy179: + yych = *++YYCURSOR; + if (yych <= '9') { + if (yych <= '/') goto yy178; + if (yych <= '0') goto yy182; + if (yych <= '7') goto yy183; + goto yy184; + } else { + if (yych <= 'F') { + if (yych <= '@') goto yy178; + goto yy184; + } else { + if (yych <= '`') goto yy178; + if (yych <= 'f') goto yy184; + goto yy178; + } + } +yy180: + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '/') goto yy178; + if (yych <= '9') goto yy184; + goto yy178; + } else { + if (yych <= 'F') goto yy184; + if (yych <= '`') goto yy178; + if (yych <= 'f') goto yy184; + goto yy178; + } +yy181: + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '/') goto yy178; + if (yych <= '7') goto yy184; + if (yych <= '9') goto yy185; + goto yy178; + } else { + if (yych <= 'B') goto yy185; + if (yych <= '`') goto yy178; + if (yych <= 'b') goto yy185; + goto yy178; + } 
+yy182: + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '/') goto yy178; + if (yych <= '7') goto yy186; + if (yych <= '9') goto yy187; + goto yy178; + } else { + if (yych <= 'F') goto yy187; + if (yych <= '`') goto yy178; + if (yych <= 'f') goto yy187; + goto yy178; + } +yy183: + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '/') goto yy178; + if (yych <= '9') goto yy187; + goto yy178; + } else { + if (yych <= 'F') goto yy187; + if (yych <= '`') goto yy178; + if (yych <= 'f') goto yy187; + goto yy178; + } +yy184: + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '/') goto yy178; + if (yych <= '9') goto yy188; + goto yy178; + } else { + if (yych <= 'F') goto yy188; + if (yych <= '`') goto yy178; + if (yych <= 'f') goto yy188; + goto yy178; + } +yy185: + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '/') goto yy178; + if (yych <= '9') goto yy189; + goto yy178; + } else { + if (yych <= 'F') goto yy189; + if (yych <= '`') goto yy178; + if (yych <= 'f') goto yy189; + goto yy178; + } +yy186: + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '/') goto yy178; + if (yych <= '9') goto yy190; + goto yy178; + } else { + if (yych <= 'F') goto yy190; + if (yych <= '`') goto yy178; + if (yych <= 'f') goto yy190; + goto yy178; + } +yy187: + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '/') goto yy178; + if (yych <= '9') goto yy192; + goto yy178; + } else { + if (yych <= 'F') goto yy192; + if (yych <= '`') goto yy178; + if (yych <= 'f') goto yy192; + goto yy178; + } +yy188: + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '/') goto yy178; + if (yych <= '9') goto yy194; + goto yy178; + } else { + if (yych <= 'F') goto yy194; + if (yych <= '`') goto yy178; + if (yych <= 'f') goto yy194; + goto yy178; + } +yy189: + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '/') goto yy178; + if (yych <= '9') goto yy196; + goto yy178; + } else { + if (yych <= 'F') goto yy196; + if (yych <= '`') goto yy178; + if (yych <= 'f') goto 
yy196; + goto yy178; + } +yy190: + ++YYCURSOR; + { + int utf16 = php_json_ucs2_to_int(s, 2); + PHP_JSON_SCANNER_COPY_UTF(); + *(s->pstr++) = (char) utf16; + s->str_start = s->cursor; + PHP_JSON_CONDITION_GOTO_STR_P2(); + } +yy192: + ++YYCURSOR; + { + int utf16 = php_json_ucs2_to_int(s, 3); + PHP_JSON_SCANNER_COPY_UTF(); + *(s->pstr++) = (char) (0xc0 | (utf16 >> 6)); + *(s->pstr++) = (char) (0x80 | (utf16 & 0x3f)); + s->str_start = s->cursor; + PHP_JSON_CONDITION_GOTO_STR_P2(); + } +yy194: + ++YYCURSOR; + { + int utf16 = php_json_ucs2_to_int(s, 4); + PHP_JSON_SCANNER_COPY_UTF(); + *(s->pstr++) = (char) (0xe0 | (utf16 >> 12)); + *(s->pstr++) = (char) (0x80 | ((utf16 >> 6) & 0x3f)); + *(s->pstr++) = (char) (0x80 | (utf16 & 0x3f)); + s->str_start = s->cursor; + PHP_JSON_CONDITION_GOTO_STR_P2(); + } +yy196: + yych = *++YYCURSOR; + if (yych != '\\') goto yy178; + yych = *++YYCURSOR; + if (yych != 'u') goto yy178; + yych = *++YYCURSOR; + if (yych == 'D') goto yy199; + if (yych != 'd') goto yy178; +yy199: + yych = *++YYCURSOR; + if (yych <= 'B') goto yy178; + if (yych <= 'F') goto yy200; + if (yych <= 'b') goto yy178; + if (yych >= 'g') goto yy178; +yy200: + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '/') goto yy178; + if (yych >= ':') goto yy178; + } else { + if (yych <= 'F') goto yy201; + if (yych <= '`') goto yy178; + if (yych >= 'g') goto yy178; + } +yy201: yych = *++YYCURSOR; if (yych <= '@') { - if (yych <= '/') goto yy132; - if (yych >= ':') goto yy132; + if (yych <= '/') goto yy178; + if (yych >= ':') goto yy178; } else { - if (yych <= 'F') goto yy156; - if (yych <= '`') goto yy132; - if (yych >= 'g') goto yy132; + if (yych <= 'F') goto yy202; + if (yych <= '`') goto yy178; + if (yych >= 'g') goto yy178; } -yy156: +yy202: ++YYCURSOR; { int utf32, utf16_hi, utf16_lo; @@ -1253,7 +1646,7 @@ int php_json_scan(php_json_scanner *s) *(s->pstr++) = (char) (0x80 | ((utf32 >> 6) & 0x3f)); *(s->pstr++) = (char) (0x80 | (utf32 & 0x3f)); s->str_start = s->cursor; - 
PHP_JSON_CONDITION_GOTO(STR_P2); + PHP_JSON_CONDITION_GOTO_STR_P2(); } } diff --git a/ext/json/json_scanner.re b/ext/json/json_scanner.re index 580ef745c8483..6f23a4c41ce08 100644 --- a/ext/json/json_scanner.re +++ b/ext/json/json_scanner.re @@ -35,6 +35,18 @@ #define PHP_JSON_CONDITION_SET(condition) YYSETCONDITION(yyc##condition) #define PHP_JSON_CONDITION_GOTO(condition) goto yyc_##condition +#define PHP_JSON_CONDITION_SET_AND_GOTO(condition) \ + PHP_JSON_CONDITION_SET(condition); \ + PHP_JSON_CONDITION_GOTO(condition) +#define PHP_JSON_CONDITION_GOTO_STR_P2() \ + do { \ + if (s->utf8_sub_needed) { \ + PHP_JSON_CONDITION_GOTO(STR_P2_BIN); \ + } else { \ + PHP_JSON_CONDITION_GOTO(STR_P2_UTF); \ + } \ + } while(0) + #define PHP_JSON_SCANNER_COPY_ESC() php_json_scanner_copy_string(s, 0) #define PHP_JSON_SCANNER_COPY_UTF() php_json_scanner_copy_string(s, 5) @@ -197,8 +209,7 @@ std: ["] { s->str_start = s->cursor; s->str_esc = 0; - PHP_JSON_CONDITION_SET(STR_P1); - PHP_JSON_CONDITION_GOTO(STR_P1); + PHP_JSON_CONDITION_SET_AND_GOTO(STR_P1); } CTRL { s->errcode = PHP_JSON_ERROR_CTRL_CHAR; @@ -259,8 +270,11 @@ std: if (s->str_esc) { s->pstr = (php_json_ctype *) Z_STRVAL(s->value); s->cursor = s->str_start; - PHP_JSON_CONDITION_SET(STR_P2); - PHP_JSON_CONDITION_GOTO(STR_P2); + if (s->utf8_sub_needed) { + PHP_JSON_CONDITION_SET_AND_GOTO(STR_P2_BIN); + } else { + PHP_JSON_CONDITION_SET_AND_GOTO(STR_P2_UTF); + } } else { memcpy(Z_STRVAL(s->value), s->str_start, len); PHP_JSON_CONDITION_SET(JS); @@ -281,31 +295,31 @@ std: return PHP_JSON_T_ERROR; } - UTF16_1 { + UTF16_1 { int utf16 = php_json_ucs2_to_int(s, 2); PHP_JSON_SCANNER_COPY_UTF(); *(s->pstr++) = (char) utf16; s->str_start = s->cursor; - PHP_JSON_CONDITION_GOTO(STR_P2); + PHP_JSON_CONDITION_GOTO_STR_P2(); } - UTF16_2 { + UTF16_2 { int utf16 = php_json_ucs2_to_int(s, 3); PHP_JSON_SCANNER_COPY_UTF(); *(s->pstr++) = (char) (0xc0 | (utf16 >> 6)); *(s->pstr++) = (char) (0x80 | (utf16 & 0x3f)); s->str_start = s->cursor; - 
PHP_JSON_CONDITION_GOTO(STR_P2); + PHP_JSON_CONDITION_GOTO_STR_P2(); } - UTF16_3 { + UTF16_3 { int utf16 = php_json_ucs2_to_int(s, 4); PHP_JSON_SCANNER_COPY_UTF(); *(s->pstr++) = (char) (0xe0 | (utf16 >> 12)); *(s->pstr++) = (char) (0x80 | ((utf16 >> 6) & 0x3f)); *(s->pstr++) = (char) (0x80 | (utf16 & 0x3f)); s->str_start = s->cursor; - PHP_JSON_CONDITION_GOTO(STR_P2); + PHP_JSON_CONDITION_GOTO_STR_P2(); } - UTF16_4 { + UTF16_4 { int utf32, utf16_hi, utf16_lo; utf16_hi = php_json_ucs2_to_int(s, 4); utf16_lo = php_json_ucs2_to_int_ex(s, 4, 7); @@ -316,9 +330,9 @@ std: *(s->pstr++) = (char) (0x80 | ((utf32 >> 6) & 0x3f)); *(s->pstr++) = (char) (0x80 | (utf32 & 0x3f)); s->str_start = s->cursor; - PHP_JSON_CONDITION_GOTO(STR_P2); + PHP_JSON_CONDITION_GOTO_STR_P2(); } - ESCPREF { + ESCPREF { char esc; PHP_JSON_SCANNER_COPY_ESC(); switch (*s->cursor) { @@ -349,13 +363,17 @@ std: *(s->pstr++) = esc; ++YYCURSOR; s->str_start = s->cursor; - PHP_JSON_CONDITION_GOTO(STR_P2); + PHP_JSON_CONDITION_GOTO_STR_P2(); } - ["] => JS { + ["] => JS { PHP_JSON_SCANNER_COPY_ESC(); return PHP_JSON_T_STRING; } - ANY { PHP_JSON_CONDITION_GOTO(STR_P2); } + UTF8 { PHP_JSON_CONDITION_GOTO(STR_P2_BIN); } + ANY { + PHP_JSON_CONDITION_GOTO(STR_P2_BIN); + } + ANY { PHP_JSON_CONDITION_GOTO(STR_P2_UTF); } <*>ANY { s->errcode = PHP_JSON_ERROR_SYNTAX; diff --git a/ext/json/php_json_scanner_defs.h b/ext/json/php_json_scanner_defs.h index d62dd9137af86..9f464722766cc 100644 --- a/ext/json/php_json_scanner_defs.h +++ b/ext/json/php_json_scanner_defs.h @@ -3,5 +3,6 @@ enum YYCONDTYPE { yycJS, yycSTR_P1, - yycSTR_P2, + yycSTR_P2_BIN, + yycSTR_P2_UTF, }; From 3c4c02ba5291704575d824da39a9082e6a3ece5a Mon Sep 17 00:00:00 2001 From: Jakub Zelenka Date: Sun, 2 Jul 2017 15:45:48 +0100 Subject: [PATCH 4/8] Simplify decoder invalid UTF8 substitution --- ext/json/json_scanner.c | 13 +++++++++---- ext/json/json_scanner.re | 13 +++++++++---- ext/json/php_json_scanner.h | 3 +-- 3 files changed, 19 insertions(+), 10 
deletions(-) diff --git a/ext/json/json_scanner.c b/ext/json/json_scanner.c index fd824d4dd8b5c..6f6766829b092 100644 --- a/ext/json/json_scanner.c +++ b/ext/json/json_scanner.c @@ -41,7 +41,7 @@ PHP_JSON_CONDITION_GOTO(condition) #define PHP_JSON_CONDITION_GOTO_STR_P2() \ do { \ - if (s->utf8_sub_needed) { \ + if (s->utf8_sub) { \ PHP_JSON_CONDITION_GOTO(STR_P2_BIN); \ } else { \ PHP_JSON_CONDITION_GOTO(STR_P2_UTF); \ @@ -594,7 +594,7 @@ int php_json_scan(php_json_scanner *s) if (s->str_esc) { s->pstr = (php_json_ctype *) Z_STRVAL(s->value); s->cursor = s->str_start; - if (s->utf8_sub_needed) { + if (s->utf8_sub) { PHP_JSON_CONDITION_SET_AND_GOTO(STR_P2_BIN); } else { PHP_JSON_CONDITION_SET_AND_GOTO(STR_P2_UTF); @@ -645,8 +645,7 @@ int php_json_scan(php_json_scanner *s) PHP_JSON_CONDITION_GOTO(STR_P1); } if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) { - s->utf8_sub_needed = 1; - s->utf8_sub_len += 2 - (s->cursor - s->token); + s->utf8_sub = 1; PHP_JSON_CONDITION_GOTO(STR_P1); } s->errcode = PHP_JSON_ERROR_UTF8; @@ -1056,6 +1055,12 @@ int php_json_scan(php_json_scanner *s) ++YYCURSOR; yy132: { + if (s->utf8_sub) { + php_json_scanner_copy_string(s, 2 - (s->cursor - s->token)); + *(s->pstr++) = (char) (0xc0 | (0xfffd >> 6)); + *(s->pstr++) = (char) (0x80 | (0xfffd & 0x3f)); + s->str_start = s->cursor; + } PHP_JSON_CONDITION_GOTO(STR_P2_BIN); } yy133: diff --git a/ext/json/json_scanner.re b/ext/json/json_scanner.re index 6f23a4c41ce08..03aa37e49f75f 100644 --- a/ext/json/json_scanner.re +++ b/ext/json/json_scanner.re @@ -40,7 +40,7 @@ PHP_JSON_CONDITION_GOTO(condition) #define PHP_JSON_CONDITION_GOTO_STR_P2() \ do { \ - if (s->utf8_sub_needed) { \ + if (s->utf8_sub) { \ PHP_JSON_CONDITION_GOTO(STR_P2_BIN); \ } else { \ PHP_JSON_CONDITION_GOTO(STR_P2_UTF); \ @@ -270,7 +270,7 @@ std: if (s->str_esc) { s->pstr = (php_json_ctype *) Z_STRVAL(s->value); s->cursor = s->str_start; - if (s->utf8_sub_needed) { + if (s->utf8_sub) { 
PHP_JSON_CONDITION_SET_AND_GOTO(STR_P2_BIN); } else { PHP_JSON_CONDITION_SET_AND_GOTO(STR_P2_UTF); @@ -287,8 +287,7 @@ std: PHP_JSON_CONDITION_GOTO(STR_P1); } if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) { - s->utf8_sub_needed = 1; - s->utf8_sub_len += 2 - (s->cursor - s->token); + s->utf8_sub = 1; PHP_JSON_CONDITION_GOTO(STR_P1); } s->errcode = PHP_JSON_ERROR_UTF8; @@ -371,6 +370,12 @@ std: } UTF8 { PHP_JSON_CONDITION_GOTO(STR_P2_BIN); } ANY { + if (s->utf8_sub) { + php_json_scanner_copy_string(s, 2 - (s->cursor - s->token)); + *(s->pstr++) = (char) (0xc0 | (0xfffd >> 6)); + *(s->pstr++) = (char) (0x80 | (0xfffd & 0x3f)); + s->str_start = s->cursor; + } PHP_JSON_CONDITION_GOTO(STR_P2_BIN); } ANY { PHP_JSON_CONDITION_GOTO(STR_P2_UTF); } diff --git a/ext/json/php_json_scanner.h b/ext/json/php_json_scanner.h index 6f962b4406d6c..9f85d50b32540 100644 --- a/ext/json/php_json_scanner.h +++ b/ext/json/php_json_scanner.h @@ -37,8 +37,7 @@ typedef struct _php_json_scanner { int state; /* condition state */ int options; /* options */ php_json_error_code errcode; /* error type if there is an error */ - int utf8_sub_needed; /* whether utf8 substitution is needed */ - int utf8_sub_len; /* how many extra character is needed (can be negative) */ + int utf8_sub; /* whether utf8 substitution is needed */ } php_json_scanner; From 4b0acfa98f07cb0f4efd86e0054a2517832d4976 Mon Sep 17 00:00:00 2001 From: Jakub Zelenka Date: Sun, 2 Jul 2017 18:12:19 +0100 Subject: [PATCH 5/8] Finish substitution and ignoring invalid UTF-8 in JSON decoder --- ext/json/json_scanner.c | 36 +++++++++----------- ext/json/json_scanner.re | 33 ++++++++---------- ext/json/php_json_scanner.h | 3 +- ext/json/tests/json_decode_invalid_utf8.phpt | 19 +++++++++++ ext/json/tests/json_encode_invalid_utf8.phpt | 2 +- 5 files changed, 53 insertions(+), 40 deletions(-) create mode 100644 ext/json/tests/json_decode_invalid_utf8.phpt diff --git a/ext/json/json_scanner.c b/ext/json/json_scanner.c index 
6f6766829b092..1ed970e722c84 100644 --- a/ext/json/json_scanner.c +++ b/ext/json/json_scanner.c @@ -41,7 +41,7 @@ PHP_JSON_CONDITION_GOTO(condition) #define PHP_JSON_CONDITION_GOTO_STR_P2() \ do { \ - if (s->utf8_sub) { \ + if (s->utf8_invalid) { \ PHP_JSON_CONDITION_GOTO(STR_P2_BIN); \ } else { \ PHP_JSON_CONDITION_GOTO(STR_P2_UTF); \ @@ -582,7 +582,7 @@ int php_json_scan(php_json_scanner *s) ++YYCURSOR; { zend_string *str; - size_t len = s->cursor - s->str_start - s->str_esc - 1; + size_t len = s->cursor - s->str_start - s->str_esc - 1 + s->utf8_invalid_count; if (len == 0) { PHP_JSON_CONDITION_SET(JS); ZVAL_EMPTY_STRING(&s->value); @@ -591,14 +591,10 @@ int php_json_scan(php_json_scanner *s) str = zend_string_alloc(len, 0); ZSTR_VAL(str)[len] = '\0'; ZVAL_STR(&s->value, str); - if (s->str_esc) { + if (s->str_esc || s->utf8_invalid) { s->pstr = (php_json_ctype *) Z_STRVAL(s->value); s->cursor = s->str_start; - if (s->utf8_sub) { - PHP_JSON_CONDITION_SET_AND_GOTO(STR_P2_BIN); - } else { - PHP_JSON_CONDITION_SET_AND_GOTO(STR_P2_UTF); - } + PHP_JSON_CONDITION_GOTO_STR_P2(); } else { memcpy(Z_STRVAL(s->value), s->str_start, len); PHP_JSON_CONDITION_SET(JS); @@ -641,11 +637,10 @@ int php_json_scan(php_json_scanner *s) ++YYCURSOR; yy80: { - if (s->options & PHP_JSON_INVALID_UTF8_IGNORE) { - PHP_JSON_CONDITION_GOTO(STR_P1); - } - if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) { - s->utf8_sub = 1; + if (s->options & (PHP_JSON_INVALID_UTF8_IGNORE | PHP_JSON_INVALID_UTF8_SUBSTITUTE)) { + int utf8_addition = (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) ? 
4 : 1; + s->utf8_invalid = 1; + s->utf8_invalid_count = utf8_addition - (s->cursor - s->token); PHP_JSON_CONDITION_GOTO(STR_P1); } s->errcode = PHP_JSON_ERROR_UTF8; @@ -1026,8 +1021,7 @@ int php_json_scan(php_json_scanner *s) esc = '\b'; break; case 'f': - esc = '\f'; - break; + esc = '\f'; break; case 'n': esc = '\n'; break; @@ -1055,10 +1049,13 @@ int php_json_scan(php_json_scanner *s) ++YYCURSOR; yy132: { - if (s->utf8_sub) { + if (s->utf8_invalid) { php_json_scanner_copy_string(s, 2 - (s->cursor - s->token)); - *(s->pstr++) = (char) (0xc0 | (0xfffd >> 6)); - *(s->pstr++) = (char) (0x80 | (0xfffd & 0x3f)); + if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) { + *(s->pstr++) = (char) (0xe0 | (0xfffd >> 12)); + *(s->pstr++) = (char) (0x80 | ((0xfffd >> 6) & 0x3f)); + *(s->pstr++) = (char) (0x80 | (0xfffd & 0x3f)); + } s->str_start = s->cursor; } PHP_JSON_CONDITION_GOTO(STR_P2_BIN); @@ -1386,8 +1383,7 @@ int php_json_scan(php_json_scanner *s) esc = '\b'; break; case 'f': - esc = '\f'; - break; + esc = '\f'; break; case 'n': esc = '\n'; break; diff --git a/ext/json/json_scanner.re b/ext/json/json_scanner.re index 03aa37e49f75f..7c879699b6919 100644 --- a/ext/json/json_scanner.re +++ b/ext/json/json_scanner.re @@ -40,7 +40,7 @@ PHP_JSON_CONDITION_GOTO(condition) #define PHP_JSON_CONDITION_GOTO_STR_P2() \ do { \ - if (s->utf8_sub) { \ + if (s->utf8_invalid) { \ PHP_JSON_CONDITION_GOTO(STR_P2_BIN); \ } else { \ PHP_JSON_CONDITION_GOTO(STR_P2_UTF); \ @@ -258,7 +258,7 @@ std: } ["] { zend_string *str; - size_t len = s->cursor - s->str_start - s->str_esc - 1; + size_t len = s->cursor - s->str_start - s->str_esc - 1 + s->utf8_invalid_count; if (len == 0) { PHP_JSON_CONDITION_SET(JS); ZVAL_EMPTY_STRING(&s->value); @@ -267,14 +267,10 @@ std: str = zend_string_alloc(len, 0); ZSTR_VAL(str)[len] = '\0'; ZVAL_STR(&s->value, str); - if (s->str_esc) { + if (s->str_esc || s->utf8_invalid) { s->pstr = (php_json_ctype *) Z_STRVAL(s->value); s->cursor = s->str_start; - if 
(s->utf8_sub) { - PHP_JSON_CONDITION_SET_AND_GOTO(STR_P2_BIN); - } else { - PHP_JSON_CONDITION_SET_AND_GOTO(STR_P2_UTF); - } + PHP_JSON_CONDITION_GOTO_STR_P2(); } else { memcpy(Z_STRVAL(s->value), s->str_start, len); PHP_JSON_CONDITION_SET(JS); @@ -283,11 +279,10 @@ std: } UTF8 { PHP_JSON_CONDITION_GOTO(STR_P1); } ANY { - if (s->options & PHP_JSON_INVALID_UTF8_IGNORE) { - PHP_JSON_CONDITION_GOTO(STR_P1); - } - if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) { - s->utf8_sub = 1; + if (s->options & (PHP_JSON_INVALID_UTF8_IGNORE | PHP_JSON_INVALID_UTF8_SUBSTITUTE)) { + int utf8_addition = (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) ? 4 : 1; + s->utf8_invalid = 1; + s->utf8_invalid_count = utf8_addition - (s->cursor - s->token); PHP_JSON_CONDITION_GOTO(STR_P1); } s->errcode = PHP_JSON_ERROR_UTF8; @@ -339,8 +334,7 @@ std: esc = '\b'; break; case 'f': - esc = '\f'; - break; + esc = '\f'; break; case 'n': esc = '\n'; break; @@ -370,10 +364,13 @@ std: } UTF8 { PHP_JSON_CONDITION_GOTO(STR_P2_BIN); } ANY { - if (s->utf8_sub) { + if (s->utf8_invalid) { php_json_scanner_copy_string(s, 2 - (s->cursor - s->token)); - *(s->pstr++) = (char) (0xc0 | (0xfffd >> 6)); - *(s->pstr++) = (char) (0x80 | (0xfffd & 0x3f)); + if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) { + *(s->pstr++) = (char) (0xe0 | (0xfffd >> 12)); + *(s->pstr++) = (char) (0x80 | ((0xfffd >> 6) & 0x3f)); + *(s->pstr++) = (char) (0x80 | (0xfffd & 0x3f)); + } s->str_start = s->cursor; } PHP_JSON_CONDITION_GOTO(STR_P2_BIN); diff --git a/ext/json/php_json_scanner.h b/ext/json/php_json_scanner.h index 9f85d50b32540..28cef7ee872ff 100644 --- a/ext/json/php_json_scanner.h +++ b/ext/json/php_json_scanner.h @@ -37,7 +37,8 @@ typedef struct _php_json_scanner { int state; /* condition state */ int options; /* options */ php_json_error_code errcode; /* error type if there is an error */ - int utf8_sub; /* whether utf8 substitution is needed */ + int utf8_invalid; /* whether utf8 is invalid */ + int utf8_invalid_count; 
/* number of extra characters for invalid utf8 */ } php_json_scanner; diff --git a/ext/json/tests/json_decode_invalid_utf8.phpt b/ext/json/tests/json_decode_invalid_utf8.phpt new file mode 100644 index 0000000000000..f581198c9f86b --- /dev/null +++ b/ext/json/tests/json_decode_invalid_utf8.phpt @@ -0,0 +1,19 @@ +--TEST-- +json_decode() invalid UTF8 +--SKIPIF-- + +--FILE-- + +--EXPECTF-- +NULL +string(3) "bar" +string(12) "efbfbd626172" +Done diff --git a/ext/json/tests/json_encode_invalid_utf8.phpt b/ext/json/tests/json_encode_invalid_utf8.phpt index 888b6ad7e8754..12e0d98b1acae 100644 --- a/ext/json/tests/json_encode_invalid_utf8.phpt +++ b/ext/json/tests/json_encode_invalid_utf8.phpt @@ -1,5 +1,5 @@ --TEST-- -json_encode() invalide UTF8 +json_encode() invalid UTF8 --SKIPIF-- Date: Thu, 6 Jul 2017 19:32:18 +0100 Subject: [PATCH 6/8] Add failing tests to cover more issues for JSON invalid UTF-8 --- ext/json/tests/json_decode_invalid_utf8.phpt | 20 +++++++++----- ext/json/tests/json_encode_invalid_utf8.phpt | 28 +++++++++++++------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/ext/json/tests/json_decode_invalid_utf8.phpt b/ext/json/tests/json_decode_invalid_utf8.phpt index f581198c9f86b..660f4d83fe821 100644 --- a/ext/json/tests/json_decode_invalid_utf8.phpt +++ b/ext/json/tests/json_decode_invalid_utf8.phpt @@ -6,14 +6,20 @@ if (!extension_loaded("json")) print "skip"; ?> --FILE-- ---EXPECTF-- +--EXPECT-- NULL -string(3) "bar" -string(12) "efbfbd626172" +string(2) "ab" +string(12) "efbfbd62" +NULL +string(2) "ab" +string(12) "efbfbd62" Done diff --git a/ext/json/tests/json_encode_invalid_utf8.phpt b/ext/json/tests/json_encode_invalid_utf8.phpt index 12e0d98b1acae..de843999852d7 100644 --- a/ext/json/tests/json_encode_invalid_utf8.phpt +++ b/ext/json/tests/json_encode_invalid_utf8.phpt @@ -6,18 +6,26 @@ if (!extension_loaded("json")) print "skip"; ?> --FILE-- ---EXPECTF-- +--EXPECT-- bool(false) -string(8) ""foobar"" -string(14) ""foo\ufffdbar"" 
+string(2) "ab" +string(8) "a\ufffdb" bool(false) -string(9) "%s" +string(12) "2261fffd6222" +bool(false) +string(2) "ab" +string(8) "a\ufffdb" +bool(false) +string(12) "2261fffd6222" Done From c85df7f482b8e3de7e1502fb1b5df914c25f0831 Mon Sep 17 00:00:00 2001 From: Jakub Zelenka Date: Thu, 6 Jul 2017 20:15:45 +0100 Subject: [PATCH 7/8] Fix and improve replacing invalid UTF-8 in json_encode --- ext/json/json_encoder.c | 8 +++++++- ext/json/tests/json_encode_invalid_utf8.phpt | 16 ++++++++-------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/ext/json/json_encoder.c b/ext/json/json_encoder.c index 8e1e9566e174a..50dcc020521a8 100644 --- a/ext/json/json_encoder.c +++ b/ext/json/json_encoder.c @@ -285,6 +285,7 @@ static int php_json_escape_string( do { us = (unsigned char)s[pos]; if (us >= 0x80) { + int utf8_sub = 0; size_t prev_pos = pos; us = php_next_utf8_char((unsigned char *)s, len, &pos, &status); @@ -297,6 +298,7 @@ static int php_json_escape_string( } else if (options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) { /* Use Unicode character 'REPLACEMENT CHARACTER' (U+FFFD) */ us = 0xfffd; + utf8_sub = 1; } else { if (buf->s) { ZSTR_LEN(buf->s) = checkpoint; @@ -315,7 +317,11 @@ static int php_json_escape_string( if ((options & PHP_JSON_UNESCAPED_UNICODE) && ((options & PHP_JSON_UNESCAPED_LINE_TERMINATORS) || us < 0x2028 || us > 0x2029)) { - smart_str_appendl(buf, s + prev_pos, pos - prev_pos); + if (utf8_sub) { + smart_str_appendl(buf, "\xef\xbf\xbd", 3); + } else { + smart_str_appendl(buf, s + prev_pos, pos - prev_pos); + } continue; } /* From http://en.wikipedia.org/wiki/UTF16 */ diff --git a/ext/json/tests/json_encode_invalid_utf8.phpt b/ext/json/tests/json_encode_invalid_utf8.phpt index de843999852d7..d7ec58a973f34 100644 --- a/ext/json/tests/json_encode_invalid_utf8.phpt +++ b/ext/json/tests/json_encode_invalid_utf8.phpt @@ -13,19 +13,19 @@ function json_encode_invalid_utf8($str) { var_dump(json_encode($str, JSON_UNESCAPED_UNICODE)); 
var_dump(bin2hex(json_encode($str, JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE))); } -json_encode_invalid_utf8("a\xb0b"); -json_encode_invalid_utf8("a\xd0\xf2b"); +json_encode_invalid_utf8("\x61\xb0\x62"); +json_encode_invalid_utf8("\x61\xf0\x80\x80\x41"); echo "Done\n"; ?> --EXPECT-- bool(false) -string(2) "ab" -string(8) "a\ufffdb" +string(4) ""ab"" +string(10) ""a\ufffdb"" bool(false) -string(12) "2261fffd6222" +string(14) "2261efbfbd6222" bool(false) -string(2) "ab" -string(8) "a\ufffdb" +string(4) ""aA"" +string(10) ""a\ufffdA"" bool(false) -string(12) "2261fffd6222" +string(14) "2261efbfbd4122" Done From 554cdc051aa055c138ce07889934d60d53e912ab Mon Sep 17 00:00:00 2001 From: Jakub Zelenka Date: Sun, 9 Jul 2017 19:17:34 +0100 Subject: [PATCH 8/8] Fix decoder for handling invalid UTF-8 --- ext/json/json_scanner.c | 6 +++--- ext/json/json_scanner.re | 6 +++--- ext/json/tests/json_decode_invalid_utf8.phpt | 8 ++++++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/ext/json/json_scanner.c b/ext/json/json_scanner.c index 1ed970e722c84..462a99d0136c5 100644 --- a/ext/json/json_scanner.c +++ b/ext/json/json_scanner.c @@ -638,9 +638,9 @@ int php_json_scan(php_json_scanner *s) yy80: { if (s->options & (PHP_JSON_INVALID_UTF8_IGNORE | PHP_JSON_INVALID_UTF8_SUBSTITUTE)) { - int utf8_addition = (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) ? 4 : 1; + int utf8_addition = (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) ? 
3 : 0; s->utf8_invalid = 1; - s->utf8_invalid_count = utf8_addition - (s->cursor - s->token); + s->utf8_invalid_count += utf8_addition - 1; PHP_JSON_CONDITION_GOTO(STR_P1); } s->errcode = PHP_JSON_ERROR_UTF8; @@ -1050,7 +1050,7 @@ int php_json_scan(php_json_scanner *s) yy132: { if (s->utf8_invalid) { - php_json_scanner_copy_string(s, 2 - (s->cursor - s->token)); + PHP_JSON_SCANNER_COPY_ESC(); if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) { *(s->pstr++) = (char) (0xe0 | (0xfffd >> 12)); *(s->pstr++) = (char) (0x80 | ((0xfffd >> 6) & 0x3f)); diff --git a/ext/json/json_scanner.re b/ext/json/json_scanner.re index 7c879699b6919..d26e035481100 100644 --- a/ext/json/json_scanner.re +++ b/ext/json/json_scanner.re @@ -280,9 +280,9 @@ std: UTF8 { PHP_JSON_CONDITION_GOTO(STR_P1); } ANY { if (s->options & (PHP_JSON_INVALID_UTF8_IGNORE | PHP_JSON_INVALID_UTF8_SUBSTITUTE)) { - int utf8_addition = (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) ? 4 : 1; + int utf8_addition = (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) ? 
3 : 0; s->utf8_invalid = 1; - s->utf8_invalid_count = utf8_addition - (s->cursor - s->token); + s->utf8_invalid_count += utf8_addition - 1; PHP_JSON_CONDITION_GOTO(STR_P1); } s->errcode = PHP_JSON_ERROR_UTF8; @@ -365,7 +365,7 @@ std: UTF8 { PHP_JSON_CONDITION_GOTO(STR_P2_BIN); } ANY { if (s->utf8_invalid) { - php_json_scanner_copy_string(s, 2 - (s->cursor - s->token)); + PHP_JSON_SCANNER_COPY_ESC(); if (s->options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) { *(s->pstr++) = (char) (0xe0 | (0xfffd >> 12)); *(s->pstr++) = (char) (0x80 | ((0xfffd >> 6) & 0x3f)); diff --git a/ext/json/tests/json_decode_invalid_utf8.phpt b/ext/json/tests/json_decode_invalid_utf8.phpt index 660f4d83fe821..725fe9be965bc 100644 --- a/ext/json/tests/json_decode_invalid_utf8.phpt +++ b/ext/json/tests/json_decode_invalid_utf8.phpt @@ -13,13 +13,17 @@ function json_decode_invalid_utf8($str) { } json_decode_invalid_utf8("\"a\xb0b\""); json_decode_invalid_utf8("\"a\xd0\xf2b\""); +json_decode_invalid_utf8("\"\x61\xf0\x80\x80\x41\""); echo "Done\n"; ?> --EXPECT-- NULL string(2) "ab" -string(12) "efbfbd62" +string(10) "61efbfbd62" NULL string(2) "ab" -string(12) "efbfbd62" +string(16) "61efbfbdefbfbd62" +NULL +string(2) "aA" +string(22) "61efbfbdefbfbdefbfbd41" Done