From f4a265b2c78e1c0b7e3e7f6ae0f06c6c2fef87fd Mon Sep 17 00:00:00 2001 From: Michael Telatynski <7t3chguy@gmail.com> Date: Thu, 25 May 2023 09:32:20 +0100 Subject: [PATCH] Use grapheme-splitter instead of lodash for saving emoji from being ripped apart (#10976) * Use grapheme-splitter instead of lodash for saving emoji from being ripped apart * Move to a more appropriate place * Add tests and improve types --- package.json | 1 + src/Avatar.ts | 5 ++--- src/HtmlUtils.tsx | 15 ++++++++++----- src/editor/parts.ts | 13 ++++++------- src/utils/strings.ts | 13 +++++++++++++ test/HtmlUtils-test.tsx | 18 +++++++++++++++++- test/__snapshots__/HtmlUtils-test.tsx.snap | 12 ++++++------ 7 files changed, 55 insertions(+), 22 deletions(-) diff --git a/package.json b/package.json index 959f0199947..b9a364044c6 100644 --- a/package.json +++ b/package.json @@ -82,6 +82,7 @@ "focus-visible": "^5.2.0", "gfm.css": "^1.1.2", "glob-to-regexp": "^0.4.1", + "grapheme-splitter": "^1.0.4", "highlight.js": "^11.3.1", "html-entities": "^2.0.0", "is-ip": "^3.1.0", diff --git a/src/Avatar.ts b/src/Avatar.ts index 79254ef1b59..3873a1a59de 100644 --- a/src/Avatar.ts +++ b/src/Avatar.ts @@ -18,11 +18,11 @@ import { RoomMember } from "matrix-js-sdk/src/models/room-member"; import { User } from "matrix-js-sdk/src/models/user"; import { Room } from "matrix-js-sdk/src/models/room"; import { ResizeMethod } from "matrix-js-sdk/src/@types/partials"; -import { split } from "lodash"; import DMRoomMap from "./utils/DMRoomMap"; import { mediaFromMxc } from "./customisations/Media"; import { isLocalRoom } from "./utils/localRoom/isLocalRoom"; +import { getFirstGrapheme } from "./utils/strings"; // Not to be used for BaseAvatar urls as that has similar default avatar fallback already export function avatarUrlForMember( @@ -133,8 +133,7 @@ export function getInitialLetter(name: string): string | undefined { name = name.substring(1); } - // rely on the grapheme cluster splitter in lodash so that we don't break apart compound emojis - return split(name, "", 1)[0].toUpperCase(); + return getFirstGrapheme(name).toUpperCase(); } export function avatarUrlForRoom( diff --git a/src/HtmlUtils.tsx b/src/HtmlUtils.tsx index 0d910bfecad..866c1d0a0c3 100644 --- a/src/HtmlUtils.tsx +++ b/src/HtmlUtils.tsx @@ -21,13 +21,14 @@ import React, { LegacyRef, ReactElement, ReactNode } from "react"; import sanitizeHtml from "sanitize-html"; import classNames from "classnames"; import EMOJIBASE_REGEX from "emojibase-regex"; -import { merge, split } from "lodash"; +import { merge } from "lodash"; import katex from "katex"; import { decode } from "html-entities"; import { IContent } from "matrix-js-sdk/src/models/event"; import { Optional } from "matrix-events-sdk"; import _Linkify from "linkify-react"; import escapeHtml from "escape-html"; +import GraphemeSplitter from "grapheme-splitter"; import { _linkifyElement, @@ -463,14 +464,18 @@ const emojiToJsxSpan = (emoji: string, key: number): JSX.Element => ( * @returns if isHtmlMessage is true, returns an array of strings, otherwise return an array of React Elements for emojis * and plain text for everything else */ -function formatEmojis(message: string | undefined, isHtmlMessage: boolean): (JSX.Element | string)[] { +export function formatEmojis(message: string | undefined, isHtmlMessage?: false): JSX.Element[]; +export function formatEmojis(message: string | undefined, isHtmlMessage: true): string[]; +export function formatEmojis(message: string | undefined, isHtmlMessage: boolean): (JSX.Element | string)[] { const emojiToSpan = isHtmlMessage ? emojiToHtmlSpan : emojiToJsxSpan; const result: (JSX.Element | string)[] = []; + if (!message) return result; + let text = ""; let key = 0; - // We use lodash's grapheme splitter to avoid breaking apart compound emojis - for (const char of split(message, "")) { + const splitter = new GraphemeSplitter(); + for (const char of splitter.iterateGraphemes(message)) { if (EMOJIBASE_REGEX.test(char)) { if (text) { result.push(text); @@ -661,7 +666,7 @@ export function topicToHtml( isFormattedTopic = false; // Fall back to plain-text topic } - let emojiBodyElements: ReturnType | undefined; + let emojiBodyElements: JSX.Element[] | undefined; if (!isFormattedTopic && topicHasEmoji) { emojiBodyElements = formatEmojis(topic, false); } diff --git a/src/editor/parts.ts b/src/editor/parts.ts index e25b582e207..93d79fdf059 100644 --- a/src/editor/parts.ts +++ b/src/editor/parts.ts @@ -15,11 +15,11 @@ See the License for the specific language governing permissions and limitations under the License. */ -import { split } from "lodash"; import EMOJIBASE_REGEX from "emojibase-regex"; import { MatrixClient } from "matrix-js-sdk/src/client"; import { RoomMember } from "matrix-js-sdk/src/models/room-member"; import { Room } from "matrix-js-sdk/src/models/room"; +import GraphemeSplitter from "grapheme-splitter"; import AutocompleteWrapperModel, { GetAutocompleterComponent, UpdateCallback, UpdateQuery } from "./autocomplete"; import { unicodeToShortcode } from "../HtmlUtils"; @@ -27,6 +27,7 @@ import * as Avatar from "../Avatar"; import defaultDispatcher from "../dispatcher/dispatcher"; import { Action } from "../dispatcher/actions"; import SettingsStore from "../settings/SettingsStore"; +import { getFirstGrapheme } from "../utils/strings"; const REGIONAL_EMOJI_SEPARATOR = String.fromCodePoint(0x200b); @@ -133,8 +134,7 @@ abstract class BasePart { // To only need to grapheme split the bits of the string we're working on. let buffer = str; while (buffer) { - // We use lodash's grapheme splitter to avoid breaking apart compound emojis - const [char] = split(buffer, "", 2); + const char = getFirstGrapheme(buffer); if (!this.acceptsInsertion(char, offset + str.length - buffer.length, inputType)) { break; } @@ -562,8 +562,7 @@ export class PartCreator { case "\n": return new NewlinePart(); default: - // We use lodash's grapheme splitter to avoid breaking apart compound emojis - if (EMOJIBASE_REGEX.test(split(input, "", 2)[0])) { + if (EMOJIBASE_REGEX.test(getFirstGrapheme(input))) { return new EmojiPart(); } return new PlainPart(); @@ -639,8 +638,8 @@ export class PartCreator { const parts: (PlainPart | EmojiPart)[] = []; let plainText = ""; - // We use lodash's grapheme splitter to avoid breaking apart compound emojis - for (const char of split(text, "")) { + const splitter = new GraphemeSplitter(); + for (const char of splitter.iterateGraphemes(text)) { if (EMOJIBASE_REGEX.test(char)) { if (plainText) { parts.push(this.plain(plainText)); diff --git a/src/utils/strings.ts b/src/utils/strings.ts index d065339f1f1..584b4204c98 100644 --- a/src/utils/strings.ts +++ b/src/utils/strings.ts @@ -21,6 +21,7 @@ limitations under the License. * @param text the plaintext to put in the user's clipboard */ import { logger } from "matrix-js-sdk/src/logger"; +import GraphemeSplitter from "grapheme-splitter"; export async function copyPlaintext(text: string): Promise { try { @@ -83,3 +84,15 @@ export function copyNode(ref?: Element | null): boolean { export function getSelectedText(): string { return window.getSelection()!.toString(); } + +/** + * Returns the first grapheme in the given string, + * especially useful for strings containing emoji, will not break compound emoji up. + * @param str string to parse + * @returns the first grapheme or an empty string if given an empty string + */ +export function getFirstGrapheme(str: string): string { + const splitter = new GraphemeSplitter(); + const result = splitter.iterateGraphemes(str).next(); + return result.done ? "" : result.value; +} diff --git a/test/HtmlUtils-test.tsx b/test/HtmlUtils-test.tsx index bca781792a2..779e694f83c 100644 --- a/test/HtmlUtils-test.tsx +++ b/test/HtmlUtils-test.tsx @@ -19,7 +19,7 @@ import { mocked } from "jest-mock"; import { render, screen } from "@testing-library/react"; import { IContent } from "matrix-js-sdk/src/models/event"; -import { bodyToHtml, topicToHtml } from "../src/HtmlUtils"; +import { bodyToHtml, formatEmojis, topicToHtml } from "../src/HtmlUtils"; import SettingsStore from "../src/settings/SettingsStore"; jest.mock("../src/settings/SettingsStore"); @@ -168,3 +168,19 @@ describe("bodyToHtml", () => { }); }); }); + +describe("formatEmojis", () => { + it.each([ + ["🏴󠁧󠁢󠁥󠁮󠁧󠁿", [["🏴󠁧󠁢󠁥󠁮󠁧󠁿", "flag-england"]]], + ["🏴󠁧󠁢󠁳󠁣󠁴󠁿", [["🏴󠁧󠁢󠁳󠁣󠁴󠁿", "flag-scotland"]]], + ["🏴󠁧󠁢󠁷󠁬󠁳󠁿", [["🏴󠁧󠁢󠁷󠁬󠁳󠁿", "flag-wales"]]], + ])("%s emoji", (emoji, expectations) => { + const res = formatEmojis(emoji, false); + expect(res).toHaveLength(expectations.length); + for (let i = 0; i < res.length; i++) { + const [emoji, title] = expectations[i]; + expect(res[i].props.children).toEqual(emoji); + expect(res[i].props.title).toEqual(`:${title}:`); + } + }); +}); diff --git a/test/__snapshots__/HtmlUtils-test.tsx.snap b/test/__snapshots__/HtmlUtils-test.tsx.snap index 0714120676c..c4d91467c07 100644 --- a/test/__snapshots__/HtmlUtils-test.tsx.snap +++ b/test/__snapshots__/HtmlUtils-test.tsx.snap @@ -1,5 +1,11 @@ // Jest Snapshot v1, https://goo.gl/fbAQLP +exports[`bodyToHtml feature_latex_maths should not mangle code blocks 1`] = `"

hello

$\\xi$

world

"`; + +exports[`bodyToHtml feature_latex_maths should render block katex 1`] = `"

hello

ξ\\xi

world

"`; + +exports[`bodyToHtml feature_latex_maths should render inline katex 1`] = `"hello ξ\\xi world"`; + exports[`bodyToHtml should generate big emoji for an emoji-only reply to a message 1`] = ` `; - -exports[`bodyToHtml feature_latex_maths should not mangle code blocks 1`] = `"

hello

$\\xi$

world

"`; - -exports[`bodyToHtml feature_latex_maths should render block katex 1`] = `"

hello

ξ\\xi

world

"`; - -exports[`bodyToHtml feature_latex_maths should render inline katex 1`] = `"hello ξ\\xi world"`;