Skip to content
Permalink
Browse files

Removed U+180E MONGOLIAN VOWEL SEPARATOR from Whitespace classification.

This character was recategorized from Zs to Cf in Unicode 6.3, and remains categorized as such in Unicode 9.0 (the current target version). The current version of ECMAScript requires that only characters classified as Zs in the target version of Unicode be recognized as Whitespace. This change is now reflected in Test262, so making this change will improve our Test262 score rather than regress it.

Additionally, update the comments about the location of UnicodeData.txt and the definition of Whitespace characters.

See: tc39/ecma262#300

See: tc39/test262@3a5a09e

See: mathiasbynens/regexpu-core@9b10d2a

Fixes #2120
  • Loading branch information...
dilijev committed Nov 30, 2016
1 parent 375ed38 commit 7c097b698de1e400286f9b957597b2a81fc6f80b
@@ -67,7 +67,7 @@ We then compress the above map such that:
- the result is in strictly increasing range order
Using gawk the above is:
gawk -f equiv.gawk http://www.unicode.org/Public/UNIDATA/UnicodeData.txt | gawk -f table.gawk
gawk -f equiv.gawk http://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt | gawk -f table.gawk
Where equiv.gawk is:
----------------------------------------------------------------------
@@ -206,7 +206,7 @@ END {
*/

// For case-folding entries, version 8.0.0 of CaseFolding.txt located at [1] was used.
// [1] ftp://ftp.unicode.org/Public/UNIDATA/CaseFolding.txt
// [1] http://www.unicode.org/Public/8.0.0/ucd/CaseFolding.txt
static const Transform transforms[] =
{
1, MappingSource::UnicodeData, 0x0041, 0x004a, 0, 32, 32, 32,
@@ -106,6 +106,7 @@ echo(" };");
----------------------------------------------------------------------
*/

// Character classes represented as a bit vector for each character.
const uint8 ASCIIChars::classes[] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x06, 0x04, 0x04, 0x06, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -124,6 +125,8 @@ echo(" };");
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

// Numeric values of ASCII characters interpreted as hex digits (applies to [0-9a-fA-F], all others are 0x00).
const uint8 ASCIIChars::values[] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -155,7 +158,7 @@ echo(" };");

/*
To get the whitespaces string, run:
gawk -f spaces.gawk http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
gawk -f spaces.gawk http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt
where spaces.gawk is
----------------------------------------------------------------------
BEGIN {
@@ -180,13 +183,14 @@ BEGIN {
END {
str = sprintf("%s\\x%04x\\x%04x", str, start, last);
print str;
}----------------------------------------------------------------------
}
----------------------------------------------------------------------
*/

const int StandardChars<char16>::numDigitPairs = 1;
const char16* const StandardChars<char16>::digitStr = _u("09");
const int StandardChars<char16>::numWhitespacePairs = 11;
const char16* const StandardChars<char16>::whitespaceStr = _u("\x0009\x000d\x0020\x0020\x00a0\x00a0\x1680\x1680\x180e\x180e\x2000\x200a\x2028\x2029\x202f\x202f\x205f\x205f\x3000\x3000\xfeff\xfeff");
const int StandardChars<char16>::numWhitespacePairs = 10;
const char16* const StandardChars<char16>::whitespaceStr = _u("\x0009\x000d\x0020\x0020\x00a0\x00a0\x1680\x1680\x2000\x200a\x2028\x2029\x202f\x202f\x205f\x205f\x3000\x3000\xfeff\xfeff");
const int StandardChars<char16>::numWordPairs = 4;
const char16* const StandardChars<char16>::wordStr = _u("09AZ__az");
const int StandardChars<char16>::numNewlinePairs = 3;
@@ -279,7 +279,7 @@ namespace UnifiedRegex
if (CTU(c) < ASCIIChars::NumChars)
return ASCIIChars::IsWhitespace(ASCIIChars::UTC(CTU(c)));
else
return CTU(c) == 0x1680 || CTU(c) == 0x180e || (CTU(c) >= 0x2000 && CTU(c) <= 0x200a) ||
return CTU(c) == 0x1680 || (CTU(c) >= 0x2000 && CTU(c) <= 0x200a) ||
CTU(c) == 0x2028 || CTU(c) == 0x2029 || CTU(c) == 0x202f || CTU(c) == 0x205f ||
CTU(c) == 0x3000 || CTU(c) == 0xfeff;
}
@@ -14,17 +14,28 @@

namespace Js
{
// White Space characters are defined in ES6 Section 11.2
// There are 26 white space characters we need to correctly class:
//0x0009
//0x000a
//0x000b
//0x000c
//0x000d
//0x0020
//0x00a0
// White Space characters are defined in ES 2017 Section 11.2 #sec-white-space
// There are 25 white space characters we need to classify correctly.
// - 6 of these are explicitly specified in ES 2017 Section 11.2 #sec-white-space
// - 15 of these are Unicode category "Zs" ("Space_Separator") and not explicitly specified above.
// - Note: In total, 17 of these are Unicode category "Zs".
// - 4 of these are actually LineTerminator characters.
// - Note: for various reasons it is convenient to group LineTerminator with Whitespace
// in the definition of IsWhiteSpaceCharacter.
// This does not cause problems because of the syntactic nature of LineTerminators
// and their meaning of ending a line in RegExp.
// - See: #sec-string.prototype.trim "The definition of white space is the union of WhiteSpace and LineTerminator."
// Note: ES intentionally excludes characters which have Unicode property "White_Space" but which are not "Zs".
// See http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt for character classes.
// The 25 white space characters are:
//0x0009 // <TAB>
//0x000a // <LF> LineTerminator (LINE FEED)
//0x000b // <VT>
//0x000c // <FF>
//0x000d // <CR> LineTerminator (CARRIAGE RETURN)
//0x0020 // <SP>
//0x00a0 // <NBSP>
//0x1680
//0x180e
//0x2000
//0x2001
//0x2002
@@ -36,18 +47,18 @@ namespace Js
//0x2008
//0x2009
//0x200a
//0x2028
//0x2029
//0x2028 // <LS> LineTerminator (LINE SEPARATOR)
//0x2029 // <PS> LineTerminator (PARAGRAPH SEPARATOR)
//0x202f
//0x205f
//0x3000
//0xfeff
//0xfeff // <ZWNBSP>
bool IsWhiteSpaceCharacter(char16 ch)
{
return ch >= 0x9 &&
(ch <= 0xd ||
(ch <= 0x200a &&
(ch >= 0x2000 || ch == 0x20 || ch == 0xa0 || ch == 0x1680 || ch == 0x180e)
(ch >= 0x2000 || ch == 0x20 || ch == 0xa0 || ch == 0x1680)
) ||
(ch >= 0x2028 &&
(ch <= 0x2029 || ch == 0x202f || ch == 0x205f || ch == 0x3000 || ch == 0xfeff)
@@ -7,7 +7,6 @@ start
\u0020
\u00a0
\u1680
\u180e
\u2000
\u2001
\u2002
@@ -16,7 +16,6 @@ var whitespace_characters = [
'\u0020',
'\u00a0',
'\u1680',
'\u180e',
'\u2000',
'\u2001',
'\u2002',

0 comments on commit 7c097b6

Please sign in to comment.
You can’t perform that action at this time.