Skip to content
Permalink
Browse files

[MERGE #2121 @dilijev] Removed U+180E MONGOLIAN VOWEL SEPARATOR from Whitespace classification.

Merge pull request #2121 from dilijev:regex-ws

U+180E MONGOLIAN VOWEL SEPARATOR was recategorized from Zs (Space_Separator) to Cf (Format) in Unicode 6.3, and remains categorized as Cf in Unicode 9.0 (the current target version). Beyond the code points it lists explicitly, the current version of ECMAScript requires that only characters classed as Zs in the target version of Unicode be recognized as Whitespace. Test262 already reflects this recategorization, so removing U+180E from our Whitespace classification will improve our Test262 score rather than regress it.

Additionally, update the comments about the location of UnicodeData.txt and about the definition of Whitespace characters.

See: tc39/ecma262#300

See: tc39/test262@3a5a09e

See: mathiasbynens/regexpu-core@9b10d2a

Fixes #2120
  • Loading branch information...
dilijev committed Dec 7, 2016
2 parents cecde1f + 7c097b6 commit 04074af9a45842ae660b7a68ca2be1b91ce6419f
@@ -67,7 +67,7 @@ We then compress the above map such that:
- the result is in strictly increasing range order
Using gawk the above is:
gawk -f equiv.gawk http://www.unicode.org/Public/UNIDATA/UnicodeData.txt | gawk -f table.gawk
gawk -f equiv.gawk http://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt | gawk -f table.gawk
Where equiv.gawk is:
----------------------------------------------------------------------
@@ -206,7 +206,7 @@ END {
*/

// For case-folding entries, version 8.0.0 of CaseFolding.txt located at [1] was used.
// [1] ftp://ftp.unicode.org/Public/UNIDATA/CaseFolding.txt
// [1] http://www.unicode.org/Public/8.0.0/ucd/CaseFolding.txt
static const Transform transforms[] =
{
1, MappingSource::UnicodeData, 0x0041, 0x004a, 0, 32, 32, 32,
@@ -106,6 +106,7 @@ echo(" };");
----------------------------------------------------------------------
*/

// Character classes represented as a bit vector for each character.
const uint8 ASCIIChars::classes[] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x06, 0x04, 0x04, 0x06, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -124,6 +125,8 @@ echo(" };");
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

// Numeric values of ASCII characters interpreted as hex digits (applies to [0-9a-fA-F], all others are 0x00).
const uint8 ASCIIChars::values[] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -155,7 +158,7 @@ echo(" };");

/*
To get the whitespaces string, run:
gawk -f spaces.gawk http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
gawk -f spaces.gawk http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt
where spaces.gawk is
----------------------------------------------------------------------
BEGIN {
@@ -180,13 +183,14 @@ BEGIN {
END {
str = sprintf("%s\\x%04x\\x%04x", str, start, last);
print str;
}----------------------------------------------------------------------
}
----------------------------------------------------------------------
*/

const int StandardChars<char16>::numDigitPairs = 1;
const char16* const StandardChars<char16>::digitStr = _u("09");
const int StandardChars<char16>::numWhitespacePairs = 11;
const char16* const StandardChars<char16>::whitespaceStr = _u("\x0009\x000d\x0020\x0020\x00a0\x00a0\x1680\x1680\x180e\x180e\x2000\x200a\x2028\x2029\x202f\x202f\x205f\x205f\x3000\x3000\xfeff\xfeff");
const int StandardChars<char16>::numWhitespacePairs = 10;
const char16* const StandardChars<char16>::whitespaceStr = _u("\x0009\x000d\x0020\x0020\x00a0\x00a0\x1680\x1680\x2000\x200a\x2028\x2029\x202f\x202f\x205f\x205f\x3000\x3000\xfeff\xfeff");
const int StandardChars<char16>::numWordPairs = 4;
const char16* const StandardChars<char16>::wordStr = _u("09AZ__az");
const int StandardChars<char16>::numNewlinePairs = 3;
@@ -279,7 +279,7 @@ namespace UnifiedRegex
if (CTU(c) < ASCIIChars::NumChars)
return ASCIIChars::IsWhitespace(ASCIIChars::UTC(CTU(c)));
else
return CTU(c) == 0x1680 || CTU(c) == 0x180e || (CTU(c) >= 0x2000 && CTU(c) <= 0x200a) ||
return CTU(c) == 0x1680 || (CTU(c) >= 0x2000 && CTU(c) <= 0x200a) ||
CTU(c) == 0x2028 || CTU(c) == 0x2029 || CTU(c) == 0x202f || CTU(c) == 0x205f ||
CTU(c) == 0x3000 || CTU(c) == 0xfeff;
}
@@ -14,17 +14,28 @@

namespace Js
{
// White Space characters are defined in ES6 Section 11.2
// There are 26 white space characters we need to correctly class:
//0x0009
//0x000a
//0x000b
//0x000c
//0x000d
//0x0020
//0x00a0
// White Space characters are defined in ES 2017 Section 11.2 #sec-white-space
// There are 25 white space characters we need to correctly class.
// - 6 of these are explicitly specified in ES 2017 Section 11.2 #sec-white-space
// - 15 of these are Unicode category "Zs" ("Space_Separator") and not explicitly specified above.
// - Note: In total, 17 of these are Unicode category "Zs".
// - 4 of these are actually LineTerminator characters.
// - Note: for various reasons it is convenient to group LineTerminator with Whitespace
// in the definition of IsWhiteSpaceCharacter.
// This does not cause problems because of the syntactic nature of LineTerminators
// and their meaning of ending a line in RegExp.
// - See: #sec-string.prototype.trim "The definition of white space is the union of WhiteSpace and LineTerminator."
// Note: ES intentionally excludes characters which have Unicode property "White_Space" but which are not "Zs".
// See http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt for character classes.
// The 25 white space characters are:
//0x0009 // <TAB>
//0x000a // <LF> LineTerminator (LINE FEED)
//0x000b // <VT>
//0x000c // <FF>
//0x000d // <CR> LineTerminator (CARRIAGE RETURN)
//0x0020 // <SP>
//0x00a0 // <NBSP>
//0x1680
//0x180e
//0x2000
//0x2001
//0x2002
@@ -36,18 +47,18 @@ namespace Js
//0x2008
//0x2009
//0x200a
//0x2028
//0x2029
//0x2028 // <LS> LineTerminator (LINE SEPARATOR)
//0x2029 // <PS> LineTerminator (PARAGRAPH SEPARATOR)
//0x202f
//0x205f
//0x3000
//0xfeff
//0xfeff // <ZWNBSP>
bool IsWhiteSpaceCharacter(char16 ch)
{
return ch >= 0x9 &&
(ch <= 0xd ||
(ch <= 0x200a &&
(ch >= 0x2000 || ch == 0x20 || ch == 0xa0 || ch == 0x1680 || ch == 0x180e)
(ch >= 0x2000 || ch == 0x20 || ch == 0xa0 || ch == 0x1680)
) ||
(ch >= 0x2028 &&
(ch <= 0x2029 || ch == 0x202f || ch == 0x205f || ch == 0x3000 || ch == 0xfeff)
@@ -7,7 +7,6 @@ start
\u0020
\u00a0
\u1680
\u180e
\u2000
\u2001
\u2002
@@ -16,7 +16,6 @@ var whitespace_characters = [
'\u0020',
'\u00a0',
'\u1680',
'\u180e',
'\u2000',
'\u2001',
'\u2002',

0 comments on commit 04074af

Please sign in to comment.
You can’t perform that action at this time.