Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { createText } from '../creators/createText';
import { ensureParagraph } from './ensureParagraph';
import { hasSpacesOnly } from './hasSpacesOnly';
import { isWhiteSpacePreserved } from '../../domUtils/isWhiteSpacePreserved';
import { stripInvisibleUnicode } from './stripInvisibleUnicode';
import type {
ContentModelBlockGroup,
ContentModelText,
Expand Down Expand Up @@ -32,7 +33,13 @@ export function addTextSegment(
(paragraph?.segments.length ?? 0) > 0 ||
isWhiteSpacePreserved(paragraph?.format.whiteSpace)
) {
textModel = createText(text, context.segmentFormat);
const filteredText =
context.experimentalFeatures &&
context.experimentalFeatures.indexOf('FilterInvisibleUnicode') > -1
? stripInvisibleUnicode(text)
: text;

textModel = createText(filteredText, context.segmentFormat);

if (context.isInSelection) {
textModel.isSelected = true;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Background: https://embracethered.com/blog/posts/2024/hiding-and-finding-text-with-unicode-tags/
// describes how invisible unicode "tag" characters can be used to hide text inside HTML.
// The pattern below matches every code point from U+E0000 through U+EFFFF (inclusive) so those
// characters can be removed instead of being carried along as if they were normal text.
// NOTE(review): this range looks wider than the Unicode "Tags" block itself (U+E0000 - U+E007F);
// it presumably also covers the Variation Selectors Supplement (U+E0100 - U+E01EF) - confirm that
// dropping those is intended.
const INVISIBLE_UNICODE_REGEX = /[\u{E0000}-\u{EFFFF}]/gu;

/**
 * @internal
 * Remove every code point in the U+E0000 - U+EFFFF range from a string
 * @param value The text to clean up
 * @returns A copy of the input without those code points; every other character is kept as is
 */
export function stripInvisibleUnicode(value: string): string {
    const cleaned = value.replace(INVISIBLE_UNICODE_REGEX, '');

    return cleaned;
}
71 changes: 71 additions & 0 deletions packages/roosterjs-content-model-dom/test/endToEndTest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3271,6 +3271,77 @@ describe('End to end test for DOM => Model => DOM/TEXT', () => {
);
});

// End to end check (DOM => Model => TEXT) with the 'FilterInvisibleUnicode' experimental feature on:
// the two characters from the U+E0000 - U+EFFFF range must be absent from both the model text and
// the plain text output, while the other invisible characters stay in place.
it('Text with invisible unicode tag characters is stripped when FilterInvisibleUnicode feature is enabled', () => {
// Source HTML contains U+E0041 / U+E0042 (unicode tag range — must be stripped)
// mixed with U+200B (ZWSP), U+202E (RLO), U+202C (PDF)
// which must be preserved.
const div1 = document.createElement('div');
div1.innerHTML = '<p>a\u{E0041}b\u{200B}c\u{E0042}d\u{202E}evil\u{202C}e</p>';

// Opt in to the filtering through the experimental feature list of the DOM to Model context
const model = domToContentModel(
div1,
createDomToModelContext({ experimentalFeatures: ['FilterInvisibleUnicode'] })
);

// Expect a single paragraph with one text segment that no longer holds U+E0041 / U+E0042
expect(model).toEqual({
blockGroupType: 'Document',
blocks: [
{
blockType: 'Paragraph',
segments: [
{
segmentType: 'Text',
text: 'ab\u{200B}cd\u{202E}evil\u{202C}e',
format: {},
},
],
format: {
marginTop: '1em',
marginBottom: '1em',
},
decorator: {
tagName: 'p',
format: {},
},
},
],
});

// The plain text produced from the model must match the filtered segment text
const text = contentModelToText(model);
expect(text).toBe('ab\u{200B}cd\u{202E}evil\u{202C}e');
});

// Counterpart of the "feature enabled" case: with a default DOM to Model context (no experimental
// features passed), U+E0041 / U+E0042 are expected to remain in the model text untouched.
it('Text with invisible unicode tag characters is NOT stripped when feature is disabled', () => {
const div1 = document.createElement('div');
div1.innerHTML = '<p>a\u{E0041}b\u{E0042}c</p>';

// No options passed, so 'FilterInvisibleUnicode' is not requested
const model = domToContentModel(div1, createDomToModelContext());

// The text segment is expected to equal the original text, including both tag characters
expect(model).toEqual({
blockGroupType: 'Document',
blocks: [
{
blockType: 'Paragraph',
segments: [
{
segmentType: 'Text',
text: 'a\u{E0041}b\u{E0042}c',
format: {},
},
],
format: {
marginTop: '1em',
marginBottom: '1em',
},
decorator: {
tagName: 'p',
format: {},
},
},
],
});
});

it('LI without UL followed by other blocks', () => {
runTest(
'<li>test</li><div>other</div>',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,4 +206,56 @@ describe('addTextSegment', () => {
],
});
});

// addTextSegment with 'FilterInvisibleUnicode' enabled: the U+E0041 / U+E0042 characters in the
// input are expected to be removed before the text segment is stored in the model.
it('Add text with invisible unicode, feature enabled', () => {
const group = createContentModelDocument();
const context = createDomToModelContext({
experimentalFeatures: ['FilterInvisibleUnicode'],
});

addTextSegment(group, 'a\u{E0041}b\u{E0042}c', context);

// Only the visible characters 'abc' are expected, inside an implicit paragraph
expect(group).toEqual({
blockGroupType: 'Document',
blocks: [
{
blockType: 'Paragraph',
format: {},
segments: [
{
segmentType: 'Text',
text: 'abc',
format: {},
},
],
isImplicit: true,
},
],
});
});

// addTextSegment with a default context (no experimental features passed): the input text is
// expected to be stored as is, including the U+E0041 / U+E0042 characters.
it('Add text with invisible unicode, feature disabled', () => {
const group = createContentModelDocument();
const context = createDomToModelContext();

addTextSegment(group, 'a\u{E0041}b\u{E0042}c', context);

// The segment text is expected to be identical to the input
expect(group).toEqual({
blockGroupType: 'Document',
blocks: [
{
blockType: 'Paragraph',
format: {},
segments: [
{
segmentType: 'Text',
text: 'a\u{E0041}b\u{E0042}c',
format: {},
},
],
isImplicit: true,
},
],
});
});
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import { stripInvisibleUnicode } from '../../../lib/modelApi/common/stripInvisibleUnicode';

// Unit tests for stripInvisibleUnicode: code points from U+E0000 to U+EFFFF are expected to be
// removed, everything else (including other invisible characters) is expected to be kept.
describe('stripInvisibleUnicode', () => {
it('should strip invisible unicode characters in the tag range', () => {
expect(stripInvisibleUnicode('a\u{E0041}b\u{E0042}c')).toBe('abc');
});

it('should strip all characters when input contains only invisible unicode', () => {
expect(stripInvisibleUnicode('\u{E0000}\u{E007F}\u{EFFFF}')).toBe('');
});

it('should strip characters at range boundaries (U+E0000 and U+EFFFF)', () => {
// U+DFFFF and U+F0000 sit immediately outside the range and are expected to survive
expect(stripInvisibleUnicode('\u{DFFFF}start\u{E0000}mid\u{EFFFF}end\u{F0000}')).toBe(
'\u{DFFFF}startmidend\u{F0000}'
);
});

it('should preserve meaningful invisible characters outside the tag range', () => {
// U+200B = Zero-Width Space, U+200D = Zero-Width Joiner,
// U+202E = Right-to-Left Override, U+202C = Pop Directional Formatting
const text = 'a\u{200B}b\u{200D}c\u{202E}d\u{202C}e';
expect(stripInvisibleUnicode(text)).toBe(text);
});

it('should strip tag-range chars while keeping meaningful invisible chars', () => {
// Mixed input: U+E0041 / U+E0042 are removed, U+200B and U+202E stay
expect(stripInvisibleUnicode('a\u{200B}\u{E0041}b\u{202E}\u{E0042}c')).toBe(
'a\u{200B}b\u{202E}c'
);
});

it('should not modify visible characters', () => {
const text = 'hello world 你好';
expect(stripInvisibleUnicode(text)).toBe(text);
});

it('should return empty string for empty input', () => {
expect(stripInvisibleUnicode('')).toBe('');
});

it('should handle a long sequence of tag characters', () => {
// 100 consecutive code points starting at U+E0000
const tags = Array.from({ length: 100 }, (_, i) => String.fromCodePoint(0xe0000 + i)).join(
''
);
expect(stripInvisibleUnicode('before' + tags + 'after')).toBe('beforeafter');
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,63 @@ describe('Creators', () => {
});
});

// createText itself is expected to leave U+E0041 / U+E0042 in the text it is given
it('createText with invisible unicode characters does not strip by default', () => {
const text = 'a\u{E0041}b\u{E0042}c';
const result = createText(text);

// Resulting segment text is expected to equal the input
expect(result).toEqual({
segmentType: 'Text',
format: {},
text: 'a\u{E0041}b\u{E0042}c',
});
});

// Input made only of characters from the U+E0000 - U+EFFFF range: createText is expected to keep
// all of them rather than producing an empty text
it('createText with only invisible unicode characters does not strip by default', () => {
const text = '\u{E0000}\u{E007F}\u{EFFFF}';
const result = createText(text);

expect(result).toEqual({
segmentType: 'Text',
format: {},
text: '\u{E0000}\u{E007F}\u{EFFFF}',
});
});

// Input mixes the first / last code points of the U+E0000 - U+EFFFF range with their immediate
// neighbours (U+DFFFF, U+F0000): createText is expected to keep every one of them
it('createText with invisible unicode at boundary range does not strip by default', () => {
const text = '\u{DFFFF}start\u{E0000}mid\u{EFFFF}end\u{F0000}';
const result = createText(text);

expect(result).toEqual({
segmentType: 'Text',
format: {},
text: '\u{DFFFF}start\u{E0000}mid\u{EFFFF}end\u{F0000}',
});
});

it('createText preserves meaningful invisible characters outside the tag range', () => {
// U+200B = Zero-Width Space, U+200D = Zero-Width Joiner,
// U+202E = Right-to-Left Override, U+202C = Pop Directional Formatting
const text = 'a​b‍c‮d‬e';
const result = createText(text);

expect(result).toEqual({
segmentType: 'Text',
format: {},
text: 'a​b‍c‮d‬e',
});
});

it('createText does not strip visible characters', () => {
const text = 'hello world 你好 ​';
const result = createText(text);

expect(result).toEqual({
segmentType: 'Text',
format: {},
text: 'hello world 你好 ​',
});
});

it('createTableRow', () => {
const row = createTableRow();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,11 @@ export type ExperimentalFeature =
/**
* Transform the table border colors when switching from light to dark mode
*/
| 'TransformTableBorderColors';
| 'TransformTableBorderColors'

/**
* Strip invisible unicode characters (U+E0000 to U+EFFFF) from text segments during DOM to Model conversion.
* These characters can be used to hide text in HTML and may cause unexpected behavior.
* @see https://embracethered.com/blog/posts/2024/hiding-and-finding-text-with-unicode-tags/
*/
| 'FilterInvisibleUnicode';
Loading