diff --git a/packages/roosterjs-content-model-dom/lib/modelApi/common/addTextSegment.ts b/packages/roosterjs-content-model-dom/lib/modelApi/common/addTextSegment.ts index dc8536468de5..4a93d275148c 100644 --- a/packages/roosterjs-content-model-dom/lib/modelApi/common/addTextSegment.ts +++ b/packages/roosterjs-content-model-dom/lib/modelApi/common/addTextSegment.ts @@ -4,6 +4,7 @@ import { createText } from '../creators/createText'; import { ensureParagraph } from './ensureParagraph'; import { hasSpacesOnly } from './hasSpacesOnly'; import { isWhiteSpacePreserved } from '../../domUtils/isWhiteSpacePreserved'; +import { stripInvisibleUnicode } from './stripInvisibleUnicode'; import type { ContentModelBlockGroup, ContentModelText, @@ -32,7 +33,13 @@ export function addTextSegment( (paragraph?.segments.length ?? 0) > 0 || isWhiteSpacePreserved(paragraph?.format.whiteSpace) ) { - textModel = createText(text, context.segmentFormat); + const filteredText = + context.experimentalFeatures && + context.experimentalFeatures.indexOf('FilterInvisibleUnicode') > -1 + ? stripInvisibleUnicode(text) + : text; + + textModel = createText(filteredText, context.segmentFormat); if (context.isInSelection) { textModel.isSelected = true; diff --git a/packages/roosterjs-content-model-dom/lib/modelApi/common/stripInvisibleUnicode.ts b/packages/roosterjs-content-model-dom/lib/modelApi/common/stripInvisibleUnicode.ts new file mode 100644 index 000000000000..07e9858d369a --- /dev/null +++ b/packages/roosterjs-content-model-dom/lib/modelApi/common/stripInvisibleUnicode.ts @@ -0,0 +1,14 @@ +// According to https://embracethered.com/blog/posts/2024/hiding-and-finding-text-with-unicode-tags/ +// there are some invisible unicode characters in the range of U+E0000 to U+EFFFF, which are used for hiding text in HTML. +// We need to strip them out before processing the pasted content, otherwise they will be treated as normal text and cause unexpected behavior. 
+const INVISIBLE_UNICODE_REGEX = /[\u{E0000}-\u{EFFFF}]/gu; + +/** + * @internal + * Strip every code point in the range U+E0000 to U+EFFFF from the given string. Characters outside that range, including other invisible ones such as U+200B, are kept. + * @param value The string to be processed + * @returns The string with every match removed (the regex is global, so all occurrences are replaced). NOTE(review): the range is wider than the Unicode Tags block (U+E0000 to U+E007F) and also covers Variation Selectors Supplement (U+E0100 to U+E01EF); confirm that stripping those is intended + */ +export function stripInvisibleUnicode(value: string): string { + return value.replace(INVISIBLE_UNICODE_REGEX, ''); +} diff --git a/packages/roosterjs-content-model-dom/test/endToEndTest.ts b/packages/roosterjs-content-model-dom/test/endToEndTest.ts index 91a11f330c2b..87807d48cbdc 100644 --- a/packages/roosterjs-content-model-dom/test/endToEndTest.ts +++ b/packages/roosterjs-content-model-dom/test/endToEndTest.ts @@ -3271,6 +3271,77 @@ describe('End to end test for DOM => Model => DOM/TEXT', () => { ); }); + it('Text with invisible unicode tag characters is stripped when FilterInvisibleUnicode feature is enabled', () => { + // Source HTML contains U+E0041 / U+E0042 (unicode tag range — must be stripped) + // mixed with U+200B (ZWSP), U+200D (ZWJ), U+202E (RLO), U+202C (PDF) + // which must be preserved. + const div1 = document.createElement('div'); + div1.innerHTML = '

a\u{E0041}b\u{200B}c\u{E0042}d\u{202E}evil\u{202C}e

'; + + const model = domToContentModel( + div1, + createDomToModelContext({ experimentalFeatures: ['FilterInvisibleUnicode'] }) + ); + + expect(model).toEqual({ + blockGroupType: 'Document', + blocks: [ + { + blockType: 'Paragraph', + segments: [ + { + segmentType: 'Text', + text: 'ab\u{200B}cd\u{202E}evil\u{202C}e', + format: {}, + }, + ], + format: { + marginTop: '1em', + marginBottom: '1em', + }, + decorator: { + tagName: 'p', + format: {}, + }, + }, + ], + }); + + const text = contentModelToText(model); + expect(text).toBe('ab\u{200B}cd\u{202E}evil\u{202C}e'); + }); + + it('Text with invisible unicode tag characters is NOT stripped when feature is disabled', () => { + const div1 = document.createElement('div'); + div1.innerHTML = '

a\u{E0041}b\u{E0042}c

'; + + const model = domToContentModel(div1, createDomToModelContext()); + + expect(model).toEqual({ + blockGroupType: 'Document', + blocks: [ + { + blockType: 'Paragraph', + segments: [ + { + segmentType: 'Text', + text: 'a\u{E0041}b\u{E0042}c', + format: {}, + }, + ], + format: { + marginTop: '1em', + marginBottom: '1em', + }, + decorator: { + tagName: 'p', + format: {}, + }, + }, + ], + }); + }); + it('LI without UL followed by other blocks', () => { runTest( '
  • test
  • other
    ', diff --git a/packages/roosterjs-content-model-dom/test/modelApi/common/addTextSegmentTest.ts b/packages/roosterjs-content-model-dom/test/modelApi/common/addTextSegmentTest.ts index c3ccabd1dfa1..9d83c04ca885 100644 --- a/packages/roosterjs-content-model-dom/test/modelApi/common/addTextSegmentTest.ts +++ b/packages/roosterjs-content-model-dom/test/modelApi/common/addTextSegmentTest.ts @@ -206,4 +206,56 @@ describe('addTextSegment', () => { ], }); }); + + it('Add text with invisible unicode, feature enabled', () => { + const group = createContentModelDocument(); + const context = createDomToModelContext({ + experimentalFeatures: ['FilterInvisibleUnicode'], + }); + + addTextSegment(group, 'a\u{E0041}b\u{E0042}c', context); + + expect(group).toEqual({ + blockGroupType: 'Document', + blocks: [ + { + blockType: 'Paragraph', + format: {}, + segments: [ + { + segmentType: 'Text', + text: 'abc', + format: {}, + }, + ], + isImplicit: true, + }, + ], + }); + }); + + it('Add text with invisible unicode, feature disabled', () => { + const group = createContentModelDocument(); + const context = createDomToModelContext(); + + addTextSegment(group, 'a\u{E0041}b\u{E0042}c', context); + + expect(group).toEqual({ + blockGroupType: 'Document', + blocks: [ + { + blockType: 'Paragraph', + format: {}, + segments: [ + { + segmentType: 'Text', + text: 'a\u{E0041}b\u{E0042}c', + format: {}, + }, + ], + isImplicit: true, + }, + ], + }); + }); }); diff --git a/packages/roosterjs-content-model-dom/test/modelApi/common/stripInvisibleUnicodeTest.ts b/packages/roosterjs-content-model-dom/test/modelApi/common/stripInvisibleUnicodeTest.ts new file mode 100644 index 000000000000..bae02f32005d --- /dev/null +++ b/packages/roosterjs-content-model-dom/test/modelApi/common/stripInvisibleUnicodeTest.ts @@ -0,0 +1,46 @@ +import { stripInvisibleUnicode } from '../../../lib/modelApi/common/stripInvisibleUnicode'; + +describe('stripInvisibleUnicode', () => { + it('should strip invisible unicode 
characters in the tag range', () => { + expect(stripInvisibleUnicode('a\u{E0041}b\u{E0042}c')).toBe('abc'); + }); + + it('should strip all characters when input contains only invisible unicode', () => { + expect(stripInvisibleUnicode('\u{E0000}\u{E007F}\u{EFFFF}')).toBe(''); + }); + + it('should strip characters at range boundaries (U+E0000 and U+EFFFF)', () => { + expect(stripInvisibleUnicode('\u{DFFFF}start\u{E0000}mid\u{EFFFF}end\u{F0000}')).toBe( + '\u{DFFFF}startmidend\u{F0000}' + ); + }); + + it('should preserve meaningful invisible characters outside the tag range', () => { + // U+200B = Zero-Width Space, U+200D = Zero-Width Joiner, + // U+202E = Right-to-Left Override, U+202C = Pop Directional Formatting + const text = 'a\u{200B}b\u{200D}c\u{202E}d\u{202C}e'; + expect(stripInvisibleUnicode(text)).toBe(text); + }); + + it('should strip tag-range chars while keeping meaningful invisible chars', () => { + expect(stripInvisibleUnicode('a\u{200B}\u{E0041}b\u{202E}\u{E0042}c')).toBe( + 'a\u{200B}b\u{202E}c' + ); + }); + + it('should not modify visible characters', () => { + const text = 'hello world 你好'; + expect(stripInvisibleUnicode(text)).toBe(text); + }); + + it('should return empty string for empty input', () => { + expect(stripInvisibleUnicode('')).toBe(''); + }); + + it('should handle a long sequence of tag characters', () => { + const tags = Array.from({ length: 100 }, (_, i) => String.fromCodePoint(0xe0000 + i)).join( + '' + ); + expect(stripInvisibleUnicode('before' + tags + 'after')).toBe('beforeafter'); + }); +}); diff --git a/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts b/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts index 2429877479b3..21dae0216a4e 100644 --- a/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts +++ b/packages/roosterjs-content-model-dom/test/modelApi/creators/creatorsTest.ts @@ -233,6 +233,63 @@ describe('Creators', () => { }); }); + it('createText 
with invisible unicode characters does not strip by default', () => { + const text = 'a\u{E0041}b\u{E0042}c'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: 'a\u{E0041}b\u{E0042}c', + }); + }); + + it('createText with only invisible unicode characters does not strip by default', () => { + const text = '\u{E0000}\u{E007F}\u{EFFFF}'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: '\u{E0000}\u{E007F}\u{EFFFF}', + }); + }); + + it('createText with invisible unicode at boundary range does not strip by default', () => { + const text = '\u{DFFFF}start\u{E0000}mid\u{EFFFF}end\u{F0000}'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: '\u{DFFFF}start\u{E0000}mid\u{EFFFF}end\u{F0000}', + }); + }); + + it('createText preserves meaningful invisible characters outside the tag range', () => { + // ​ = Zero-Width Space, ‍ = Zero-Width Joiner, + // ‮ = Right-to-Left Override, ‬ = Pop Directional Formatting + const text = 'a​b‍c‮d‬e'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: 'a​b‍c‮d‬e', + }); + }); + + it('createText does not strip visible characters', () => { + const text = 'hello world 你好 ​'; + const result = createText(text); + + expect(result).toEqual({ + segmentType: 'Text', + format: {}, + text: 'hello world 你好 ​', + }); + }); + it('createTableRow', () => { const row = createTableRow(); diff --git a/packages/roosterjs-content-model-types/lib/editor/ExperimentalFeature.ts b/packages/roosterjs-content-model-types/lib/editor/ExperimentalFeature.ts index 4b4e7f3be7a2..1d50197a8df3 100644 --- a/packages/roosterjs-content-model-types/lib/editor/ExperimentalFeature.ts +++ b/packages/roosterjs-content-model-types/lib/editor/ExperimentalFeature.ts @@ -64,4 +64,11 @@ export type ExperimentalFeature = /** * Transform the table border 
colors when switching from light to dark mode */ - | 'TransformTableBorderColors'; + | 'TransformTableBorderColors' + + /** + * Strip invisible unicode characters (U+E0000 to U+EFFFF) from text segments during DOM to Model conversion. + * These characters can be used to hide text in HTML and may cause unexpected behavior. + * @see https://embracethered.com/blog/posts/2024/hiding-and-finding-text-with-unicode-tags/ + */ + | 'FilterInvisibleUnicode';