diff --git a/lib/internal/readline.js b/lib/internal/readline.js index dbe8775dba3aac..60fe946560aaaa 100644 --- a/lib/internal/readline.js +++ b/lib/internal/readline.js @@ -1,103 +1,117 @@ 'use strict'; -// Regexes used for ansi escape code splitting +// Regex used for ansi escape code splitting // eslint-disable-next-line no-control-regex -const metaKeyCodeReAnywhere = /(?:\x1b)([a-zA-Z0-9])/; -const functionKeyCodeReAnywhere = new RegExp('(?:\x1b+)(O|N|\\[|\\[\\[)(?:' + [ - '(\\d+)(?:;(\\d+))?([~^$])', - '(?:M([@ #!a`])(.)(.))', // mouse - '(?:1;)?(\\d+)?([a-zA-Z])' -].join('|') + ')'); +// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js +// License: MIT, authors: @sindresorhus, Qix-, and arjunmehta +// Matches all ansi escape code sequences in a string +const ansi = + /[\u001b\u009b][[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-ORZcf-nqry=><]/g; module.exports = { emitKeys, - getStringWidth, - isFullWidthCodePoint, stripVTControlCharacters }; +if (process.binding('config').hasIntl) { + const icu = process.binding('icu'); + module.exports.getStringWidth = function getStringWidth(str, options) { + options = options || {}; + if (!Number.isInteger(str)) + str = stripVTControlCharacters(String(str)); + return icu.getStringWidth(str, + Boolean(options.ambiguousAsFullWidth), + Boolean(options.expandEmojiSequence)); + }; + module.exports.isFullWidthCodePoint = + function isFullWidthCodePoint(code, options) { + if (typeof code !== 'number') + return false; + return icu.getStringWidth(code, options) === 2; + }; +} else { + /** + * Returns the number of columns required to display the given string. + */ + module.exports.getStringWidth = function getStringWidth(str) { + if (Number.isInteger(str)) + return module.exports.isFullWidthCodePoint(str) ? 2 : 1; -/** - * Returns the number of columns required to display the given string. - */ -function getStringWidth(str) { - let width = 0; + let width = 0; - str = stripVTControlCharacters(str); + str = stripVTControlCharacters(String(str)); - for (var i = 0; i < str.length; i++) { - const code = str.codePointAt(i); + for (var i = 0; i < str.length; i++) { + const code = str.codePointAt(i); - if (code >= 0x10000) { // surrogates - i++; - } + if (code >= 0x10000) { // surrogates + i++; + } - if (isFullWidthCodePoint(code)) { - width += 2; - } else { - width++; + if (module.exports.isFullWidthCodePoint(code)) { + width += 2; + } else { + width++; + } } - } - - return width; -} + return width; + }; -/** - * Returns true if the character represented by a given - * Unicode code point is full-width. Otherwise returns false. - */ -function isFullWidthCodePoint(code) { - if (isNaN(code)) { - return false; - } + /** + * Returns true if the character represented by a given + * Unicode code point is full-width. Otherwise returns false. + */ + module.exports.isFullWidthCodePoint = function isFullWidthCodePoint(code) { + if (!Number.isInteger(code)) { + return false; + } - // Code points are derived from: - // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt - if (code >= 0x1100 && ( - code <= 0x115f || // Hangul Jamo - 0x2329 === code || // LEFT-POINTING ANGLE BRACKET - 0x232a === code || // RIGHT-POINTING ANGLE BRACKET - // CJK Radicals Supplement .. Enclosed CJK Letters and Months - (0x2e80 <= code && code <= 0x3247 && code !== 0x303f) || - // Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A - 0x3250 <= code && code <= 0x4dbf || - // CJK Unified Ideographs .. Yi Radicals - 0x4e00 <= code && code <= 0xa4c6 || - // Hangul Jamo Extended-A - 0xa960 <= code && code <= 0xa97c || - // Hangul Syllables - 0xac00 <= code && code <= 0xd7a3 || - // CJK Compatibility Ideographs - 0xf900 <= code && code <= 0xfaff || - // Vertical Forms - 0xfe10 <= code && code <= 0xfe19 || - // CJK Compatibility Forms .. Small Form Variants - 0xfe30 <= code && code <= 0xfe6b || - // Halfwidth and Fullwidth Forms - 0xff01 <= code && code <= 0xff60 || - 0xffe0 <= code && code <= 0xffe6 || - // Kana Supplement - 0x1b000 <= code && code <= 0x1b001 || - // Enclosed Ideographic Supplement - 0x1f200 <= code && code <= 0x1f251 || - // CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane - 0x20000 <= code && code <= 0x3fffd)) { - return true; - } + // Code points are derived from: + // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt + if (code >= 0x1100 && ( + code <= 0x115f || // Hangul Jamo + 0x2329 === code || // LEFT-POINTING ANGLE BRACKET + 0x232a === code || // RIGHT-POINTING ANGLE BRACKET + // CJK Radicals Supplement .. Enclosed CJK Letters and Months + (0x2e80 <= code && code <= 0x3247 && code !== 0x303f) || + // Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A + 0x3250 <= code && code <= 0x4dbf || + // CJK Unified Ideographs .. Yi Radicals + 0x4e00 <= code && code <= 0xa4c6 || + // Hangul Jamo Extended-A + 0xa960 <= code && code <= 0xa97c || + // Hangul Syllables + 0xac00 <= code && code <= 0xd7a3 || + // CJK Compatibility Ideographs + 0xf900 <= code && code <= 0xfaff || + // Vertical Forms + 0xfe10 <= code && code <= 0xfe19 || + // CJK Compatibility Forms .. Small Form Variants + 0xfe30 <= code && code <= 0xfe6b || + // Halfwidth and Fullwidth Forms + 0xff01 <= code && code <= 0xff60 || + 0xffe0 <= code && code <= 0xffe6 || + // Kana Supplement + 0x1b000 <= code && code <= 0x1b001 || + // Enclosed Ideographic Supplement + 0x1f200 <= code && code <= 0x1f251 || + // CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane + 0x20000 <= code && code <= 0x3fffd)) { + return true; + } - return false; + return false; + }; } - /** * Tries to remove all VT control characters. Use to estimate displayed * string width. May be buggy due to not running a real state machine */ function stripVTControlCharacters(str) { - str = str.replace(new RegExp(functionKeyCodeReAnywhere.source, 'g'), ''); - return str.replace(new RegExp(metaKeyCodeReAnywhere.source, 'g'), ''); + return str.replace(ansi, ''); } diff --git a/lib/readline.js b/lib/readline.js index 3927402f63ef52..9b925a6d990d8b 100644 --- a/lib/readline.js +++ b/lib/readline.js @@ -124,6 +124,14 @@ function Interface(input, output, completer, terminal) { function onkeypress(s, key) { self._ttyWrite(s, key); + if (key && key.sequence) { + // if the key.sequence is half of a surrogate pair + // (>= 0xd800 and <= 0xdfff), refresh the line so + // the character is displayed appropriately. + const ch = key.sequence.codePointAt(0); + if (ch >= 0xd800 && ch <= 0xdfff) + self._refreshLine(); + } } function onresize() { diff --git a/src/node_i18n.cc b/src/node_i18n.cc index f89ae40a558b93..e77591babf4b94 100644 --- a/src/node_i18n.cc +++ b/src/node_i18n.cc @@ -31,6 +31,7 @@ #include "v8.h" #include +#include #include #include @@ -185,6 +186,94 @@ static void ToASCII(const FunctionCallbackInfo& args) { len).ToLocalChecked()); } +// This is similar to wcwidth except that it takes the current unicode +// character properties database into consideration, allowing it to +// correctly calculate the column widths of things like emoji's and +// newer wide characters. wcwidth, on the other hand, uses a fixed +// algorithm that does not take things like emoji into proper +// consideration. +static int GetColumnWidth(UChar32 codepoint, + bool ambiguous_as_full_width = false) { + if (!u_isdefined(codepoint) || + u_iscntrl(codepoint) || + u_getCombiningClass(codepoint) > 0 || + u_hasBinaryProperty(codepoint, UCHAR_EMOJI_MODIFIER)) { + return 0; + } + // UCHAR_EAST_ASIAN_WIDTH is the Unicode property that identifies a + // codepoint as being full width, wide, ambiguous, neutral, narrow, + // or halfwidth. + const int eaw = u_getIntPropertyValue(codepoint, UCHAR_EAST_ASIAN_WIDTH); + switch (eaw) { + case U_EA_FULLWIDTH: + case U_EA_WIDE: + return 2; + case U_EA_AMBIGUOUS: + // See: http://www.unicode.org/reports/tr11/#Ambiguous for details + if (ambiguous_as_full_width) { + return 2; + } + // Fall through if ambiguous_as_full_width if false. + case U_EA_NEUTRAL: + if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) { + return 2; + } + // Fall through + case U_EA_HALFWIDTH: + case U_EA_NARROW: + default: + return 1; + } +} + +// Returns the column width for the given String. +static void GetStringWidth(const FunctionCallbackInfo& args) { + Environment* env = Environment::GetCurrent(args); + if (args.Length() < 1) + return; + + bool ambiguous_as_full_width = args[1]->BooleanValue(); + bool expand_emoji_sequence = args[2]->BooleanValue(); + + if (args[0]->IsNumber()) { + args.GetReturnValue().Set( + GetColumnWidth(args[0]->Uint32Value(), + ambiguous_as_full_width)); + return; + } + + TwoByteValue value(env->isolate(), args[0]); + // reinterpret_cast is required by windows to compile + UChar* str = reinterpret_cast(*value); + UChar32 c; + UChar32 p; + size_t n = 0; + uint32_t width = 0; + + while (n < value.length()) { + p = c; + U16_NEXT(str, n, value.length(), c); + // Don't count individual emoji codepoints that occur within an + // emoji sequence. This is not necessarily foolproof. Some + // environments display emoji sequences in the appropriate + // condensed form (as a single emoji glyph), other environments + // may not understand an emoji sequence and will display each + // individual emoji separately. When this happens, the width + // calculated will be off, and there's no reliable way of knowing + // in advance if a particular sequence is going to be supported. + // The expand_emoji_sequence option allows the caller to skip this + // check and count each code within an emoji sequence separately. + if (!expand_emoji_sequence && + n > 0 && p == 0x200d && // 0x200d == ZWJ (zero width joiner) + (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) || + u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER))) { + continue; + } + width += GetColumnWidth(c, ambiguous_as_full_width); + } + args.GetReturnValue().Set(width); +} + void Init(Local target, Local unused, Local context, @@ -192,6 +281,7 @@ void Init(Local target, Environment* env = Environment::GetCurrent(context); env->SetMethod(target, "toUnicode", ToUnicode); env->SetMethod(target, "toASCII", ToASCII); + env->SetMethod(target, "getStringWidth", GetStringWidth); } } // namespace i18n diff --git a/test/parallel/test-icu-stringwidth.js b/test/parallel/test-icu-stringwidth.js new file mode 100644 index 00000000000000..5b66f00c32840c --- /dev/null +++ b/test/parallel/test-icu-stringwidth.js @@ -0,0 +1,43 @@ +// Flags: --expose_internals +'use strict'; + +const common = require('../common'); +const assert = require('assert'); +const readline = require('internal/readline'); + +if (!process.binding('config').hasIntl) { + common.skip('missing intl... skipping test'); + return; +} + +// Test column width +assert.strictEqual(readline.getStringWidth('a'), 1); +assert.strictEqual(readline.getStringWidth('丁'), 2); +assert.strictEqual(readline.getStringWidth('\ud83d\udc78\ud83c\udfff'), 2); +assert.strictEqual(readline.getStringWidth('πŸ‘…'), 2); +assert.strictEqual(readline.getStringWidth('\n'), 0); +assert.strictEqual(readline.getStringWidth('\u200Ef\u200F'), 1); +assert.strictEqual(readline.getStringWidth(97), 1); + +// The following is an emoji sequence. In some implementations, it is +// represented as a single glyph, in other implementations as a sequence +// of individual glyphs. By default, the algorithm will assume the single +// glyph interpretation and return a value of 2. By passing the +// expandEmojiSequence: true option, each component will be counted +// individually. +assert.strictEqual(readline.getStringWidth('πŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§'), 2); +assert.strictEqual( + readline.getStringWidth('πŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§', {expandEmojiSequence: true}), 8); + +// By default, unicode characters whose width is considered ambiguous will +// be considered half-width. For these characters, getStringWidth will return +// 1. In some contexts, however, it is more appropriate to consider them full +// width. By default, the algorithm will assume half width. By passing +// the ambiguousAsFullWidth: true option, ambiguous characters will be counted +// as 2 columns. +assert.strictEqual(readline.getStringWidth('\u01d4'), 1); +assert.strictEqual( + readline.getStringWidth('\u01d4', {ambiguousAsFullWidth: true}), 2); + +// Control chars and combining chars are zero +assert.strictEqual(readline.getStringWidth('\u200E\n\u220A\u20D2'), 1);