Skip to content

Commit

Permalink
Update the list of double width codepoints
Browse files Browse the repository at this point in the history
All East Asian Width wide and full-width codepoints
are considered double width, as well as emojis and
symbols commonly rendered as emoji.

Reviewed By: aaron.ballman

Differential Revision: https://reviews.llvm.org/D138518
  • Loading branch information
cor3ntin committed Nov 28, 2022
1 parent 9e5fa4b commit 2903769
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 22 deletions.
61 changes: 39 additions & 22 deletions llvm/lib/Support/Unicode.cpp
Expand Up @@ -300,8 +300,7 @@ bool isFormatting(int UCS) {
/// * 0 for non-spacing and enclosing combining marks;
/// * 2 for CJK characters excluding halfwidth forms;
/// * 1 for all remaining characters.
static inline int charWidth(int UCS)
{
static inline int charWidth(int UCS) {
if (!isPrintable(UCS))
return ErrorNonPrintableCharacter;

Expand Down Expand Up @@ -430,26 +429,45 @@ static inline int charWidth(int UCS)
if (CombiningCharacters.contains(UCS))
return 0;

// We consider double width codepoints any codepoint with
// the property East_Asian_Width=F|W
// + Misc Symbols and Pictographs (U+1F300...U+1F5FF)
// + Supplemental Symbols and Pictographs (U+1F900...U+1F9FF)
static const UnicodeCharRange DoubleWidthCharacterRanges[] = {
// Hangul Jamo
{ 0x1100, 0x11FF },
// Deprecated fullwidth angle brackets
{ 0x2329, 0x232A },
// CJK Misc, CJK Unified Ideographs, Yijing Hexagrams, Yi
// excluding U+303F (IDEOGRAPHIC HALF FILL SPACE)
{ 0x2E80, 0x303E }, { 0x3040, 0xA4CF },
// Hangul
{ 0xAC00, 0xD7A3 }, { 0xD7B0, 0xD7C6 }, { 0xD7CB, 0xD7FB },
// CJK Unified Ideographs
{ 0xF900, 0xFAFF },
// Vertical forms
{ 0xFE10, 0xFE19 },
// CJK Compatibility Forms + Small Form Variants
{ 0xFE30, 0xFE6F },
// Fullwidth forms
{ 0xFF01, 0xFF60 }, { 0xFFE0, 0xFFE6 },
// CJK Unified Ideographs
{ 0x20000, 0x2A6DF }, { 0x2A700, 0x2B81F }, { 0x2F800, 0x2FA1F }
{0x1100, 0x115F}, {0x231A, 0x231B}, {0x2329, 0x232A},
{0x23E9, 0x23EC}, {0x23F0, 0x23F0}, {0x23F3, 0x23F3},
{0x25FD, 0x25FE}, {0x2614, 0x2615}, {0x2648, 0x2653},
{0x267F, 0x267F}, {0x2693, 0x2693}, {0x26A1, 0x26A1},
{0x26AA, 0x26AB}, {0x26BD, 0x26BE}, {0x26C4, 0x26C5},
{0x26CE, 0x26CE}, {0x26D4, 0x26D4}, {0x26EA, 0x26EA},
{0x26F2, 0x26F3}, {0x26F5, 0x26F5}, {0x26FA, 0x26FA},
{0x26FD, 0x26FD}, {0x2705, 0x2705}, {0x270A, 0x270B},
{0x2728, 0x2728}, {0x274C, 0x274C}, {0x274E, 0x274E},
{0x2753, 0x2755}, {0x2757, 0x2757}, {0x2795, 0x2797},
{0x27B0, 0x27B0}, {0x27BF, 0x27BF}, {0x2B1B, 0x2B1C},
{0x2B50, 0x2B50}, {0x2B55, 0x2B55}, {0x2E80, 0x2E99},
{0x2E9B, 0x2EF3}, {0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB},
{0x3000, 0x303E}, {0x3041, 0x3096}, {0x3099, 0x30FF},
{0x3105, 0x312F}, {0x3131, 0x318E}, {0x3190, 0x31E3},
{0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0xA48C},
{0xA490, 0xA4C6}, {0xA960, 0xA97C}, {0xAC00, 0xD7A3},
{0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52},
{0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60},
{0xFFE0, 0xFFE6}, {0x16FE0, 0x16FE4}, {0x16FF0, 0x16FF1},
{0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08},
{0x1AFF0, 0x1AFF3}, {0x1AFF5, 0x1AFFB}, {0x1AFFD, 0x1AFFE},
{0x1B000, 0x1B122}, {0x1B132, 0x1B132}, {0x1B150, 0x1B152},
{0x1B155, 0x1B155}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB},
{0x1F004, 0x1F004}, {0x1F0CF, 0x1F0CF}, {0x1F18E, 0x1F18E},
{0x1F191, 0x1F19A}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23B},
{0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265},
{0x1F300, 0x1F64F}, {0x1F680, 0x1F6C5}, {0x1F6CC, 0x1F6CC},
{0x1F6D0, 0x1F6D2}, {0x1F6D5, 0x1F6D7}, {0x1F6DC, 0x1F6DF},
{0x1F6EB, 0x1F6EC}, {0x1F6F4, 0x1F6FC}, {0x1F7E0, 0x1F7EB},
{0x1F7F0, 0x1F7F0}, {0x1F900, 0x1F9FF}, {0x1FA70, 0x1FA7C},
{0x1FA80, 0x1FA88}, {0x1FA90, 0x1FABD}, {0x1FABF, 0x1FAC5},
{0x1FACE, 0x1FADB}, {0x1FAE0, 0x1FAE8}, {0x1FAF0, 0x1FAF8},
{0x20000, 0x2FFFD}, {0x30000, 0x3FFFD}
};
static const UnicodeCharSet DoubleWidthCharacters(DoubleWidthCharacterRanges);

Expand Down Expand Up @@ -493,4 +511,3 @@ int columnWidthUTF8(StringRef Text) {
} // namespace unicode
} // namespace sys
} // namespace llvm

5 changes: 5 additions & 0 deletions llvm/unittests/Support/UnicodeTest.cpp
Expand Up @@ -45,6 +45,11 @@ TEST(Unicode, columnWidthUTF8) {
EXPECT_EQ(3, columnWidthUTF8("q\344\270\200"));
EXPECT_EQ(3, columnWidthUTF8("\314\200\340\270\201\344\270\200"));

EXPECT_EQ(2, columnWidthUTF8("\u231A")); // WATCH (emoji)
EXPECT_EQ(2, columnWidthUTF8("\U0001FADB")); // PEA POD (Unicode 15 emoji)
EXPECT_EQ(2, columnWidthUTF8("\U0001B132")); // HIRAGANA LETTER SMALL KO
EXPECT_EQ(2, columnWidthUTF8("\U00017042")); // TANGUT IDEOGRAPH

// Invalid UTF-8 strings, columnWidthUTF8 should error out.
EXPECT_EQ(-2, columnWidthUTF8("\344"));
EXPECT_EQ(-2, columnWidthUTF8("\344\270"));
Expand Down

0 comments on commit 2903769

Please sign in to comment.