From ac0f0c4e64cf1a96ff5feac568cd38039ac1c3ec Mon Sep 17 00:00:00 2001 From: JiuqingSong Date: Tue, 19 May 2026 13:24:58 -0700 Subject: [PATCH 1/4] Filter out invisible unicode characters --- .../lib/modelApi/creators/createText.ts | 17 ++++- .../test/endToEndTest.ts | 34 ++++++++++ .../test/modelApi/creators/creatorsTest.ts | 68 +++++++++++++++++++ 3 files changed, 118 insertions(+), 1 deletion(-) diff --git a/packages/roosterjs-content-model-dom/lib/modelApi/creators/createText.ts b/packages/roosterjs-content-model-dom/lib/modelApi/creators/createText.ts index c837d11f5c96..4b8a3e48ae95 100644 --- a/packages/roosterjs-content-model-dom/lib/modelApi/creators/createText.ts +++ b/packages/roosterjs-content-model-dom/lib/modelApi/creators/createText.ts @@ -19,9 +19,10 @@ export function createText( link?: ReadonlyContentModelLink, code?: ReadonlyContentModelCode ): ContentModelText { + const filterText = stripInvisibleUnicode(text); const result: ContentModelText = { segmentType: 'Text', - text: text, + text: filterText, format: { ...format }, }; @@ -35,3 +36,17 @@ export function createText( return result; } + +// According to https://embracethered.com/blog/posts/2024/hiding-and-finding-text-with-unicode-tags/ +// there are some invisible unicode characters in the range of U+E0000 to U+EFFFF, which are used for hiding text in HTML. +// We need to strip them out before processing the pasted content, otherwise they will be treated as normal text and cause unexpected behavior. +const INVISIBLE_UNICODE_REGEX = /[\u{E0000}-\u{EFFFF}]/gu; + +/** + * Strip invisible unicode characters from the given string + * @param value The string to be processed + * @returns The string with invisible unicode characters removed + */ +function stripInvisibleUnicode(value: string): string { + return value.replace(INVISIBLE_UNICODE_REGEX, ''); +} diff --git a/packages/roosterjs-content-model-dom/test/endToEndTest.ts b/packages/roosterjs-content-model-dom/test/endToEndTest.ts index 8d55177d6929..ff6bcc5af49a 100644 --- a/packages/roosterjs-content-model-dom/test/endToEndTest.ts +++ b/packages/roosterjs-content-model-dom/test/endToEndTest.ts @@ -3028,6 +3028,40 @@ describe('End to end test for DOM => Model => DOM/TEXT', () => { ); }); + it('Text with invisible unicode tag characters is stripped, meaningful invisible chars preserved', () => { + // Source HTML contains U+E0041 / U+E0042 (unicode tag range — must be stripped) + // mixed with U+200B (ZWSP), U+200D (ZWJ), U+202E (RLO), U+202C (PDF) + // which must be preserved. + runTest( + '

a\u{E0041}b\u{200B}c\u{E0042}d\u{202E}evil\u{202C}e

', + { + blockGroupType: 'Document', + blocks: [ + { + blockType: 'Paragraph', + segments: [ + { + segmentType: 'Text', + text: 'ab\u{200B}cd\u{202E}evil\u{202C}e', + format: {}, + }, + ], + format: { + marginTop: '1em', + marginBottom: '1em', + }, + decorator: { + tagName: 'p', + format: {}, + }, + }, + ], + }, + 'ab\u{200B}cd\u{202E}evil\u{202C}e', + '

ab\u{200B}cd\u{202E}evil\u{202C}e

' + ); + }); + it('LI without UL followed by other blocks', () => { runTest( '
  • test
  • other
    ', diff --git a/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts b/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts index 2429877479b3..e40927c6f377 100644 --- a/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts +++ b/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts @@ -233,6 +233,74 @@ describe('Creators', () => { }); }); + it('createText with invisible unicode characters', () => { + const text = 'a\u{E0041}b\u{E0042}c'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: 'abc', + }); + }); + + it('createText with only invisible unicode characters', () => { + const text = '\u{E0000}\u{E007F}\u{EFFFF}'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: '', + }); + }); + + it('createText with invisible unicode at boundary range', () => { + const text = '\u{DFFFF}start\u{E0000}mid\u{EFFFF}end\u{F0000}'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: '\u{DFFFF}startmidend\u{F0000}', + }); + }); + + it('createText preserves meaningful invisible characters outside the tag range', () => { + // ​ = Zero-Width Space, ‍ = Zero-Width Joiner, + // ‮ = Right-to-Left Override, ‬ = Pop Directional Formatting + const text = 'a​b‍c‮d‬e'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: 'a​b‍c‮d‬e', + }); + }); + + it('createText strips only tag-range chars, keeps meaningful invisible chars', () => { + const text = 'a​\u{E0041}b‮\u{E0042}c'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: 'a​b‮c', + }); + }); + + it('createText does not strip visible characters', () => { + const text = 'hello world 你好   ​'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: 'hello world 你好   ​', + }); + }); + it('createTableRow', () => { const row = createTableRow(); From c72c3ccbe0fcd798c6412e97632eda7c5c8ed006 Mon Sep 17 00:00:00 2001 From: jiuqingsong Date: Thu, 28 May 2026 09:57:25 -0700 Subject: [PATCH 2/4] Gate invisible unicode stripping behind FilterInvisibleUnicode experimental feature Move the invisible unicode character stripping logic from createText (always-on) to addTextSegment, gated by the new 'FilterInvisibleUnicode' experimental feature. This ensures the behavior only activates when explicitly enabled via EditorOptions. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../lib/modelApi/common/addTextSegment.ts | 9 +- .../modelApi/common/stripInvisibleUnicode.ts | 13 +++ .../lib/modelApi/creators/createText.ts | 17 +--- .../test/endToEndTest.ts | 87 +++++++++++++------ .../modelApi/common/addTextSegmentTest.ts | 52 +++++++++++ .../test/modelApi/creators/creatorsTest.ts | 27 ++---- .../lib/editor/ExperimentalFeature.ts | 9 +- 7 files changed, 152 insertions(+), 62 deletions(-) create mode 100644 packages/roosterjs-content-model-dom/lib/modelApi/common/stripInvisibleUnicode.ts diff --git a/packages/roosterjs-content-model-dom/lib/modelApi/common/addTextSegment.ts b/packages/roosterjs-content-model-dom/lib/modelApi/common/addTextSegment.ts index dc8536468de5..4a93d275148c 100644 --- a/packages/roosterjs-content-model-dom/lib/modelApi/common/addTextSegment.ts +++ b/packages/roosterjs-content-model-dom/lib/modelApi/common/addTextSegment.ts @@ -4,6 +4,7 @@ import { createText } from '../creators/createText'; import { ensureParagraph } from './ensureParagraph'; import { hasSpacesOnly } from './hasSpacesOnly'; import { isWhiteSpacePreserved } from '../../domUtils/isWhiteSpacePreserved'; +import { stripInvisibleUnicode } from './stripInvisibleUnicode'; import type { ContentModelBlockGroup, ContentModelText, @@ -32,7 +33,13 @@ export function addTextSegment( (paragraph?.segments.length ?? 0) > 0 || isWhiteSpacePreserved(paragraph?.format.whiteSpace) ) { - textModel = createText(text, context.segmentFormat); + const filteredText = + context.experimentalFeatures && + context.experimentalFeatures.indexOf('FilterInvisibleUnicode') > -1 + ? stripInvisibleUnicode(text) + : text; + + textModel = createText(filteredText, context.segmentFormat); if (context.isInSelection) { textModel.isSelected = true; diff --git a/packages/roosterjs-content-model-dom/lib/modelApi/common/stripInvisibleUnicode.ts b/packages/roosterjs-content-model-dom/lib/modelApi/common/stripInvisibleUnicode.ts new file mode 100644 index 000000000000..1b009fd41238 --- /dev/null +++ b/packages/roosterjs-content-model-dom/lib/modelApi/common/stripInvisibleUnicode.ts @@ -0,0 +1,13 @@ +// According to https://embracethered.com/blog/posts/2024/hiding-and-finding-text-with-unicode-tags/ +// there are some invisible unicode characters in the range of U+E0000 to U+EFFFF, which are used for hiding text in HTML. +// We need to strip them out before processing the pasted content, otherwise they will be treated as normal text and cause unexpected behavior. +const INVISIBLE_UNICODE_REGEX = /[\u{E0000}-\u{EFFFF}]/gu; + +/** + * Strip invisible unicode characters from the given string + * @param value The string to be processed + * @returns The string with invisible unicode characters removed + */ +export function stripInvisibleUnicode(value: string): string { + return value.replace(INVISIBLE_UNICODE_REGEX, ''); +} diff --git a/packages/roosterjs-content-model-dom/lib/modelApi/creators/createText.ts b/packages/roosterjs-content-model-dom/lib/modelApi/creators/createText.ts index 4b8a3e48ae95..c837d11f5c96 100644 --- a/packages/roosterjs-content-model-dom/lib/modelApi/creators/createText.ts +++ b/packages/roosterjs-content-model-dom/lib/modelApi/creators/createText.ts @@ -19,10 +19,9 @@ export function createText( link?: ReadonlyContentModelLink, code?: ReadonlyContentModelCode ): ContentModelText { - const filterText = stripInvisibleUnicode(text); const result: ContentModelText = { segmentType: 'Text', - text: filterText, + text: text, format: { ...format }, }; @@ -36,17 +35,3 @@ export function createText( return result; } - -// According to https://embracethered.com/blog/posts/2024/hiding-and-finding-text-with-unicode-tags/ -// there are some invisible unicode characters in the range of U+E0000 to U+EFFFF, which are used for hiding text in HTML. -// We need to strip them out before processing the pasted content, otherwise they will be treated as normal text and cause unexpected behavior. -const INVISIBLE_UNICODE_REGEX = /[\u{E0000}-\u{EFFFF}]/gu; - -/** - * Strip invisible unicode characters from the given string - * @param value The string to be processed - * @returns The string with invisible unicode characters removed - */ -function stripInvisibleUnicode(value: string): string { - return value.replace(INVISIBLE_UNICODE_REGEX, ''); -} diff --git a/packages/roosterjs-content-model-dom/test/endToEndTest.ts b/packages/roosterjs-content-model-dom/test/endToEndTest.ts index 844b4d133201..87807d48cbdc 100644 --- a/packages/roosterjs-content-model-dom/test/endToEndTest.ts +++ b/packages/roosterjs-content-model-dom/test/endToEndTest.ts @@ -3271,38 +3271,75 @@ describe('End to end test for DOM => Model => DOM/TEXT', () => { ); }); - it('Text with invisible unicode tag characters is stripped, meaningful invisible chars preserved', () => { + it('Text with invisible unicode tag characters is stripped when FilterInvisibleUnicode feature is enabled', () => { // Source HTML contains U+E0041 / U+E0042 (unicode tag range — must be stripped) // mixed with U+200B (ZWSP), U+200D (ZWJ), U+202E (RLO), U+202C (PDF) // which must be preserved. - runTest( - '

    a\u{E0041}b\u{200B}c\u{E0042}d\u{202E}evil\u{202C}e

    ', - { - blockGroupType: 'Document', - blocks: [ - { - blockType: 'Paragraph', - segments: [ - { - segmentType: 'Text', - text: 'ab\u{200B}cd\u{202E}evil\u{202C}e', - format: {}, - }, - ], - format: { - marginTop: '1em', - marginBottom: '1em', + const div1 = document.createElement('div'); + div1.innerHTML = '

    a\u{E0041}b\u{200B}c\u{E0042}d\u{202E}evil\u{202C}e

    '; + + const model = domToContentModel( + div1, + createDomToModelContext({ experimentalFeatures: ['FilterInvisibleUnicode'] }) + ); + + expect(model).toEqual({ + blockGroupType: 'Document', + blocks: [ + { + blockType: 'Paragraph', + segments: [ + { + segmentType: 'Text', + text: 'ab\u{200B}cd\u{202E}evil\u{202C}e', + format: {}, }, - decorator: { - tagName: 'p', + ], + format: { + marginTop: '1em', + marginBottom: '1em', + }, + decorator: { + tagName: 'p', + format: {}, + }, + }, + ], + }); + + const text = contentModelToText(model); + expect(text).toBe('ab\u{200B}cd\u{202E}evil\u{202C}e'); + }); + + it('Text with invisible unicode tag characters is NOT stripped when feature is disabled', () => { + const div1 = document.createElement('div'); + div1.innerHTML = '

    a\u{E0041}b\u{E0042}c

    '; + + const model = domToContentModel(div1, createDomToModelContext()); + + expect(model).toEqual({ + blockGroupType: 'Document', + blocks: [ + { + blockType: 'Paragraph', + segments: [ + { + segmentType: 'Text', + text: 'a\u{E0041}b\u{E0042}c', format: {}, }, + ], + format: { + marginTop: '1em', + marginBottom: '1em', }, - ], - }, - 'ab\u{200B}cd\u{202E}evil\u{202C}e', - '

    ab\u{200B}cd\u{202E}evil\u{202C}e

    ' - ); + decorator: { + tagName: 'p', + format: {}, + }, + }, + ], + }); }); it('LI without UL followed by other blocks', () => { diff --git a/packages/roosterjs-content-model-dom/test/modelApi/common/addTextSegmentTest.ts b/packages/roosterjs-content-model-dom/test/modelApi/common/addTextSegmentTest.ts index c3ccabd1dfa1..9d83c04ca885 100644 --- a/packages/roosterjs-content-model-dom/test/modelApi/common/addTextSegmentTest.ts +++ b/packages/roosterjs-content-model-dom/test/modelApi/common/addTextSegmentTest.ts @@ -206,4 +206,56 @@ describe('addTextSegment', () => { ], }); }); + + it('Add text with invisible unicode, feature enabled', () => { + const group = createContentModelDocument(); + const context = createDomToModelContext({ + experimentalFeatures: ['FilterInvisibleUnicode'], + }); + + addTextSegment(group, 'a\u{E0041}b\u{E0042}c', context); + + expect(group).toEqual({ + blockGroupType: 'Document', + blocks: [ + { + blockType: 'Paragraph', + format: {}, + segments: [ + { + segmentType: 'Text', + text: 'abc', + format: {}, + }, + ], + isImplicit: true, + }, + ], + }); + }); + + it('Add text with invisible unicode, feature disabled', () => { + const group = createContentModelDocument(); + const context = createDomToModelContext(); + + addTextSegment(group, 'a\u{E0041}b\u{E0042}c', context); + + expect(group).toEqual({ + blockGroupType: 'Document', + blocks: [ + { + blockType: 'Paragraph', + format: {}, + segments: [ + { + segmentType: 'Text', + text: 'a\u{E0041}b\u{E0042}c', + format: {}, + }, + ], + isImplicit: true, + }, + ], + }); + }); }); diff --git a/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts b/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts index e40927c6f377..21dae0216a4e 100644 --- a/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts +++ b/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts @@ -233,36 +233,36 @@ describe('Creators', () => { }); }); - it('createText with invisible unicode characters', () => { + it('createText with invisible unicode characters does not strip by default', () => { const text = 'a\u{E0041}b\u{E0042}c'; const result = createText(text); expect(result).toEqual({ segmentType: 'Text', format: {}, - text: 'abc', + text: 'a\u{E0041}b\u{E0042}c', }); }); - it('createText with only invisible unicode characters', () => { + it('createText with only invisible unicode characters does not strip by default', () => { const text = '\u{E0000}\u{E007F}\u{EFFFF}'; const result = createText(text); expect(result).toEqual({ segmentType: 'Text', format: {}, - text: '', + text: '\u{E0000}\u{E007F}\u{EFFFF}', }); }); - it('createText with invisible unicode at boundary range', () => { + it('createText with invisible unicode at boundary range does not strip by default', () => { const text = '\u{DFFFF}start\u{E0000}mid\u{EFFFF}end\u{F0000}'; const result = createText(text); expect(result).toEqual({ segmentType: 'Text', format: {}, - text: '\u{DFFFF}startmidend\u{F0000}', + text: '\u{DFFFF}start\u{E0000}mid\u{EFFFF}end\u{F0000}', }); }); @@ -279,25 +279,14 @@ describe('Creators', () => { }); }); - it('createText strips only tag-range chars, keeps meaningful invisible chars', () => { - const text = 'a​\u{E0041}b‮\u{E0042}c'; - const result = createText(text); - - expect(result).toEqual({ - segmentType: 'Text', - format: {}, - text: 'a​b‮c', - }); - }); - it('createText does not strip visible characters', () => { - const text = 'hello world 你好   ​'; + const text = 'hello world 你好 ​'; const result = createText(text); expect(result).toEqual({ segmentType: 'Text', format: {}, - text: 'hello world 你好   ​', + text: 'hello world 你好 ​', }); }); diff --git a/packages/roosterjs-content-model-types/lib/editor/ExperimentalFeature.ts b/packages/roosterjs-content-model-types/lib/editor/ExperimentalFeature.ts index 4b4e7f3be7a2..1d50197a8df3 100644 --- a/packages/roosterjs-content-model-types/lib/editor/ExperimentalFeature.ts +++ b/packages/roosterjs-content-model-types/lib/editor/ExperimentalFeature.ts @@ -64,4 +64,11 @@ export type ExperimentalFeature = /** * Transform the table border colors when switching from light to dark mode */ - | 'TransformTableBorderColors'; + | 'TransformTableBorderColors' + + /** + * Strip invisible unicode characters (U+E0000 to U+EFFFF) from text segments during DOM to Model conversion. + * These characters can be used to hide text in HTML and may cause unexpected behavior. + * @see https://embracethered.com/blog/posts/2024/hiding-and-finding-text-with-unicode-tags/ + */ + | 'FilterInvisibleUnicode'; From 5454f71fdd68de38863e820475480237544b3d88 Mon Sep 17 00:00:00 2001 From: jiuqingsong Date: Thu, 28 May 2026 10:03:42 -0700 Subject: [PATCH 3/4] Add dedicated unit tests for stripInvisibleUnicode Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../common/stripInvisibleUnicodeTest.ts | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 packages/roosterjs-content-model-dom/test/modelApi/common/stripInvisibleUnicodeTest.ts diff --git a/packages/roosterjs-content-model-dom/test/modelApi/common/stripInvisibleUnicodeTest.ts b/packages/roosterjs-content-model-dom/test/modelApi/common/stripInvisibleUnicodeTest.ts new file mode 100644 index 000000000000..bae02f32005d --- /dev/null +++ b/packages/roosterjs-content-model-dom/test/modelApi/common/stripInvisibleUnicodeTest.ts @@ -0,0 +1,46 @@ +import { stripInvisibleUnicode } from '../../../lib/modelApi/common/stripInvisibleUnicode'; + +describe('stripInvisibleUnicode', () => { + it('should strip invisible unicode characters in the tag range', () => { + expect(stripInvisibleUnicode('a\u{E0041}b\u{E0042}c')).toBe('abc'); + }); + + it('should strip all characters when input contains only invisible unicode', () => { + expect(stripInvisibleUnicode('\u{E0000}\u{E007F}\u{EFFFF}')).toBe(''); + }); + + it('should strip characters at range boundaries (U+E0000 and U+EFFFF)', () => { + expect(stripInvisibleUnicode('\u{DFFFF}start\u{E0000}mid\u{EFFFF}end\u{F0000}')).toBe( + '\u{DFFFF}startmidend\u{F0000}' + ); + }); + + it('should preserve meaningful invisible characters outside the tag range', () => { + // U+200B = Zero-Width Space, U+200D = Zero-Width Joiner, + // U+202E = Right-to-Left Override, U+202C = Pop Directional Formatting + const text = 'a\u{200B}b\u{200D}c\u{202E}d\u{202C}e'; + expect(stripInvisibleUnicode(text)).toBe(text); + }); + + it('should strip tag-range chars while keeping meaningful invisible chars', () => { + expect(stripInvisibleUnicode('a\u{200B}\u{E0041}b\u{202E}\u{E0042}c')).toBe( + 'a\u{200B}b\u{202E}c' + ); + }); + + it('should not modify visible characters', () => { + const text = 'hello world 你好'; + expect(stripInvisibleUnicode(text)).toBe(text); + }); + + it('should return empty string for empty input', () => { + expect(stripInvisibleUnicode('')).toBe(''); + }); + + it('should handle a long sequence of tag characters', () => { + const tags = Array.from({ length: 100 }, (_, i) => String.fromCodePoint(0xe0000 + i)).join( + '' + ); + expect(stripInvisibleUnicode('before' + tags + 'after')).toBe('beforeafter'); + }); +}); From 10b4e10c36d98e6bd79f4c55cf9be065f491a486 Mon Sep 17 00:00:00 2001 From: jiuqingsong Date: Thu, 28 May 2026 10:18:30 -0700 Subject: [PATCH 4/4] Mark stripInvisibleUnicode as @internal to fix build Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../lib/modelApi/common/stripInvisibleUnicode.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/roosterjs-content-model-dom/lib/modelApi/common/stripInvisibleUnicode.ts b/packages/roosterjs-content-model-dom/lib/modelApi/common/stripInvisibleUnicode.ts index 1b009fd41238..07e9858d369a 100644 --- a/packages/roosterjs-content-model-dom/lib/modelApi/common/stripInvisibleUnicode.ts +++ b/packages/roosterjs-content-model-dom/lib/modelApi/common/stripInvisibleUnicode.ts @@ -4,6 +4,7 @@ const INVISIBLE_UNICODE_REGEX = /[\u{E0000}-\u{EFFFF}]/gu; /** + * @internal * Strip invisible unicode characters from the given string * @param value The string to be processed * @returns The string with invisible unicode characters removed