diff --git a/spec/09-lexical-structure.md b/spec/09-lexical-structure.md index 71f9b7a3..8c060467 100644 --- a/spec/09-lexical-structure.md +++ b/spec/09-lexical-structure.md @@ -542,6 +542,7 @@ A single-quoted string literal is always a constant expression. dq-simple-escape-sequence dq-octal-escape-sequence dq-hexadecimal-escape-sequence + dq-unicode-escape-sequence dq-simple-escape-sequence:: one of \" \\ \$ \e \f \n \r \t \v @@ -554,6 +555,13 @@ A single-quoted string literal is always a constant expression. dq-hexadecimal-escape-sequence:: \x hexadecimal-digit hexadecimal-digitopt \X hexadecimal-digit hexadecimal-digitopt + + dq-unicode-escape-sequence:: + \u{ codepoint-digits } + + codepoint-digits:: + hexadecimal-digit + hexadecimal-digit codepoint-digits *octal-digit* and *hexadecimal-digit* are defined in [§§](#integer-literals). @@ -586,13 +594,28 @@ Escape sequence | Character name | Unicode character \v | Vertical Tab | U+000B \ooo | 1–3-digit octal digit value ooo \xhh or \Xhh | 1–2-digit hexadecimal digit value hh +\u{xxxxxx} | UTF-8 encoding of Unicode codepoint U+xxxxxx | U+xxxxxx Within a double-quoted string literal, except when recognized as the start of an escape sequence, a backslash (\\) is retained verbatim. Within a double-quoted string literal a dollar ($) character not escaped by a backslash (\\) is handled using a variable substitution rules -described below. +described below. + +The `\u{xxxxxx}` escape sequence produces the UTF-8 encoding of the Unicode +codepoint whose hexadecimal value is given within the curly braces. +Implementations MUST NOT allow Unicode codepoints beyond U+10FFFF, as such +codepoints are outside the range UTF-8 can encode (see +[RFC 3629](http://tools.ietf.org/html/rfc3629#section-3)). If a codepoint +larger than U+10FFFF is specified, implementations MUST produce an error. 
+If `\u` is not followed by an opening `{`, implementations MUST pass it through +verbatim and MUST NOT interpret it as an escape sequence. If `\u` is followed by +an opening `{`, implementations MUST produce an error if there is no terminating +`}` or the contents are not a valid codepoint. Implementations MUST support leading zeroes, +but MUST NOT support leading or trailing whitespace for the codepoint between +the opening and terminating braces. Implementations MUST allow Unicode +codepoints that are not Unicode scalar values, such as high and low surrogates. **Variable substitution** @@ -695,6 +718,7 @@ echo "\$myC->p1 = >$myC->p1<\n"; // → $myC->p1 = >2< hd-simple-escape-sequence dq-octal-escape-sequence dq-hexadecimal-escape-sequence + dq-unicode-escape-sequence hd-simple-escape-sequence:: one of \\ \$ \e \f \n \r \t \v diff --git a/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php b/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php new file mode 100644 index 00000000..73cbe0ad --- /dev/null +++ b/tests/lexical_structure/unicode_string_escape_sequence/unicode_escape.php @@ -0,0 +1,8 @@ +