From 5ff52134110a1ea3866a5128102da7715385151b Mon Sep 17 00:00:00 2001 From: Andrea Faulds Date: Mon, 24 Nov 2014 20:58:13 +0000 Subject: [PATCH 1/8] Specify Unicode escape sequence --- spec/09-lexical-structure.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/spec/09-lexical-structure.md b/spec/09-lexical-structure.md index 71f9b7a3..d9712397 100644 --- a/spec/09-lexical-structure.md +++ b/spec/09-lexical-structure.md @@ -542,6 +542,7 @@ A single-quoted string literal is always a constant expression. dq-simple-escape-sequence dq-octal-escape-sequence dq-hexadecimal-escape-sequence + dq-unicode-escape-sequence dq-simple-escape-sequence:: one of \" \\ \$ \e \f \n \r \t \v @@ -554,6 +555,13 @@ A single-quoted string literal is always a constant expression. dq-hexadecimal-escape-sequence:: \x hexadecimal-digit hexadecimal-digitopt \X hexadecimal-digit hexadecimal-digitopt + + dq-unicode-escape-sequence:: + \u{ codepoint-digits } + + codepoint-digits:: + hexadecimal-digit + codepoint-digits codepoint-digits *octal-digit* and *hexadecimal-digit* are defined in [§§](#integer-literals). @@ -586,13 +594,21 @@ Escape sequence | Character name | Unicode character \v | Vertical Tab | U+000B \ooo | 1–3-digit octal digit value ooo \xhh or \Xhh | 1–2-digit hexadecimal digit value hh +\u{xxxxxx} | UTF-8 encoding of Unicode codepoint U+xxxxxx | U+xxxxxx Within a double-quoted string literal, except when recognized as the start of an escape sequence, a backslash (\\) is retained verbatim. Within a double-quoted string literal a dollar ($) character not escaped by a backslash (\\) is handled using the variable substitution rules -described below. +described below. + +The `\u{xxxxxx}` escape sequence produces the UTF-8 encoding of the Unicode +codepoint with the hexadecimal number specified within the curly braces. 
+Implementations MUST NOT allow Unicode codepoints beyond U+10FFFF as this is +outside the range UTF-8 can encode (see +[RFC 3629](http://tools.ietf.org/html/rfc3629#section-3)). If a codepoint +larger than U+10FFFF is specified, implementations MUST error. **Variable substitution** @@ -695,6 +711,7 @@ echo "\$myC->p1 = >$myC->p1<\n"; // → $myC->p1 = >2< hd-simple-escape-sequence dq-octal-escape-sequence dq-hexadecimal-escape-sequence + dq-unicode-escape-sequence hd-simple-escape-sequence:: one of \\ \$ \e \f \n \r \t \v From f14d5ad30c803c94658fbbb17decf901e4e8c24c Mon Sep 17 00:00:00 2001 From: Andrea Faulds Date: Mon, 24 Nov 2014 21:04:33 +0000 Subject: [PATCH 2/8] Add tests --- .../unicode_string_escape_sequence/unicode_escape.php | 7 +++++++ .../unicode_escape.php.expect | 5 +++++ .../unicode_escape_empty.php | 3 +++ .../unicode_escape_empty.php.expectf | 1 + .../unicode_escape_large_codepoint.php | 3 +++ .../unicode_escape_large_codepoint.php.expectf | 1 + .../unicode_escape_whitespace.php | 3 +++ .../unicode_escape_whitespace.php.expectf | 1 + 8 files changed, 24 insertions(+) create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php.expect create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_empty.php create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_empty.php.expectf create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_large_codepoint.php create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_large_codepoint.php.expectf create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_whitespace.php create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_whitespace.php.expectf diff --git 
a/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php b/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php new file mode 100644 index 00000000..17f7c9c3 --- /dev/null +++ b/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php @@ -0,0 +1,7 @@ + Date: Mon, 24 Nov 2014 22:03:05 +0000 Subject: [PATCH 3/8] corrected syntax --- spec/09-lexical-structure.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/09-lexical-structure.md b/spec/09-lexical-structure.md index d9712397..74fb5e81 100644 --- a/spec/09-lexical-structure.md +++ b/spec/09-lexical-structure.md @@ -561,7 +561,7 @@ A single-quoted string literal is always a constant expression. codepoint-digits:: hexadecimal-digit - codepoint-digits codepoint-digits + hexadecimal-digit codepoint-digits *octal-digit* and *hexadecimal-digit* are defined in [§§](#integer-literals). From 0491c2a716e484929b446f1e5e30247648ac2f58 Mon Sep 17 00:00:00 2001 From: Andrea Faulds Date: Fri, 28 Nov 2014 16:39:28 +0000 Subject: [PATCH 4/8] Test leading zeroes --- .../unicode_string_escape_sequence/unicode_escape.php | 1 + .../unicode_string_escape_sequence/unicode_escape.php.expect | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php b/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php index 17f7c9c3..73cbe0ad 100644 --- a/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php +++ b/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php @@ -5,3 +5,4 @@ var_dump("\u{ff}"); // case-insensitive var_dump("\u{2603}"); // Unicode snowman var_dump("\u{1F602}"); // FACE WITH TEARS OF JOY emoji +var_dump("\u{0000001F602}"); // Leading zeroes permitted diff --git a/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php.expect b/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php.expect 
index 7efd9649..7287ea32 100644 --- a/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php.expect +++ b/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php.expect @@ -3,3 +3,4 @@ string(2) "ÿ" string(2) "ÿ" string(3) "☃" string(4) "😂" +string(4) "😂" From 9d2db15a8a17b25e688ac0778c8abe679c56df34 Mon Sep 17 00:00:00 2001 From: Andrea Faulds Date: Wed, 10 Dec 2014 19:50:39 +0000 Subject: [PATCH 5/8] Don't error unless there's an opening { for Unicode escapes (JSON compat) --- spec/09-lexical-structure.md | 6 ++++++ .../unicode_escape_incomplete.php | 3 +++ .../unicode_escape_incomplete.php.expectf | 1 + .../unicode_escape_legacy.php | 6 ++++++ .../unicode_escape_legacy.php.expectf | 3 +++ 5 files changed, 19 insertions(+) create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_incomplete.php create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_incomplete.php.expectf create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_legacy.php create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_legacy.php.expectf diff --git a/spec/09-lexical-structure.md b/spec/09-lexical-structure.md index 74fb5e81..94597d7b 100644 --- a/spec/09-lexical-structure.md +++ b/spec/09-lexical-structure.md @@ -609,6 +609,12 @@ Implementations MUST NOT allow Unicode codepoints beyond U+10FFFF as this is outside the range UTF-8 can encode (see [RFC 3629](http://tools.ietf.org/html/rfc3629#section-3)). If a codepoint larger than U+10FFFF is specified, implementations MUST error. +Implementations MUST pass through `\u` verbatim and not interpret it as an +escape sequence if it is not followed by an opening `{`, but if it is, +implementations MUST produce an error if there is no terminating `}` or the +contents are not a valid codepoint. 
Implementations MUST support leading zeroes, +but MUST NOT support leading or trailing whitespace for the codepoint between +the opening and terminating braces. **Variable substitution** diff --git a/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_incomplete.php b/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_incomplete.php new file mode 100644 index 00000000..1cea0229 --- /dev/null +++ b/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_incomplete.php @@ -0,0 +1,3 @@ + Date: Tue, 16 Dec 2014 12:13:38 +0000 Subject: [PATCH 6/8] Allow high and low surrogates --- spec/09-lexical-structure.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spec/09-lexical-structure.md b/spec/09-lexical-structure.md index 94597d7b..8c060467 100644 --- a/spec/09-lexical-structure.md +++ b/spec/09-lexical-structure.md @@ -614,7 +614,8 @@ escape sequence if it is not followed by an opening `{`, but if it is, implementations MUST produce an error if there is no terminating `}` or the contents are not a valid codepoint. Implementations MUST support leading zeroes, but MUST NOT support leading or trailing whitespace for the codepoint between -the opening and terminating braces. +the opening and terminating braces. Implementations MUST allow Unicode +codepoints that are not Unicode scalar values, such as high and low surrogates. 
**Variable substitution** From ff0a8acec67ad7b5dc7b6045ed718e25faf2d6f3 Mon Sep 17 00:00:00 2001 From: Andrea Faulds Date: Wed, 17 Dec 2014 20:16:24 +0000 Subject: [PATCH 7/8] Add tests to ensure + and - are invalid --- .../unicode_string_escape_sequence/unicode_escape_sign.php | 3 +++ .../unicode_escape_sign.php.expectf | 1 + .../unicode_string_escape_sequence/unicode_escape_sign2.php | 3 +++ .../unicode_escape_sign2.php.expectf | 1 + 4 files changed, 8 insertions(+) create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_sign.php create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_sign.php.expectf create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_sign2.php create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_sign2.php.expectf diff --git a/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_sign.php b/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_sign.php new file mode 100644 index 00000000..7e25968d --- /dev/null +++ b/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_sign.php @@ -0,0 +1,3 @@ + Date: Wed, 17 Dec 2014 20:23:03 +0000 Subject: [PATCH 8/8] Add test for surrogate half encoding --- .../unicode_escape_surrogates.php | 8 ++++++++ .../unicode_escape_surrogates.php.expect | 3 +++ 2 files changed, 11 insertions(+) create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_surrogates.php create mode 100644 tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_surrogates.php.expect diff --git a/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_surrogates.php b/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_surrogates.php new file mode 100644 index 00000000..6b3a0738 --- /dev/null +++ b/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape_surrogates.php 
@@ -0,0 +1,8 @@ +