Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Keeping whitespaces for text nodes #23

Merged
merged 1 commit into from
May 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions src/dom/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,10 @@ impl Dom {
if dom.tree_type == DomVariant::Empty {
dom.tree_type = DomVariant::DocumentFragment;
}
dom.children.push(Node::Text(pair.as_str().to_string()));
let text = pair.as_str().to_string();
if !text.trim().is_empty() {
mathiversen marked this conversation as resolved.
Show resolved Hide resolved
dom.children.push(Node::Text(text));
}
}

// Store comments as a child, but it doesn't affect the document type selection
Expand Down Expand Up @@ -244,7 +247,10 @@ impl Dom {
}
}
Rule::node_text | Rule::el_raw_text_content => {
element.children.push(Node::Text(pair.as_str().to_string()));
let text = pair.as_str().to_string();
if !text.trim().is_empty() {
mathiversen marked this conversation as resolved.
Show resolved Hide resolved
element.children.push(Node::Text(text));
}
}
Rule::node_comment => {
element
Expand Down Expand Up @@ -304,10 +310,10 @@ impl Dom {
for pair in pairs {
match pair.as_rule() {
Rule::attr_key => {
attribute.0 = pair.as_str().to_string();
attribute.0 = pair.as_str().trim().to_string();
}
Rule::attr_non_quoted => {
attribute.1 = Some(pair.as_str().to_string());
attribute.1 = Some(pair.as_str().trim().to_string());
}
Rule::attr_quoted => {
let inner_pair = pair
Expand Down
36 changes: 18 additions & 18 deletions src/grammar/rules.pest
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,23 @@ html = _{
//
// DOCTYPE
//
doctype = { chevron_left_bang ~ ^"doctype" ~ attr* ~ chevron_right_normal}
doctype = { WSP* ~ chevron_left_bang ~ ^"doctype" ~ WSP* ~ attr* ~ WSP* ~ chevron_right_normal}

//
// NODES
//
node = _{ node_comment | node_element | node_text }
node_comment = { comment_if | comment_normal }
node_text = { (!(node_element | comment_tag_start) ~ ANY)+ }
node_comment = { WSP* ~ (comment_if | comment_normal) ~ WSP* }
node_text = { (!(node_element | comment_tag_start | chevron_left_bang) ~ ANY)+ }
node_element = { el_void | el_void_xml | el_process_instruct | el_raw_text | el_normal | el_dangling }

//
// COMMENTS
//
comment_normal = _{ comment_tag_start ~ comment_body ~ comment_tag_end }
comment_body = { (!comment_tag_end ~ ANY)* }
comment_tag_start = _{ chevron_left_bang ~ "--" }
comment_tag_end = _{ "--" ~ chevron_right_normal }
comment_tag_start = _{ chevron_left_bang ~ "--" ~ WSP* }
comment_tag_end = _{ WSP* ~ "--" ~ chevron_right_normal }

// Compatability with old IE browsers... This is not necessary for newer browsers
comment_if = _{ comment_if_start ~ comment_if_body ~ comment_if_end }
Expand All @@ -39,11 +39,11 @@ comment_if_end = _{ chevron_left_bang ~ "[" ~ ^"endif" ~ "]" ~ comment_tag_end }
//
// ATTRIBUTES
//
attr = { attr_key ~ (equal ~ (attr_non_quoted | attr_quoted ))? }
attr = { attr_key ~ (equal ~ WSP* ~ (attr_non_quoted | attr_quoted ))? }
attr_quoted = ${PUSH(quote) ~ attr_value ~ POP }
attr_non_quoted = @{ !quote ~ (!(WHITESPACE | chevron_right) ~ ANY)* }
attr_key = { ASCII_ALPHA ~ text_chars* }
attr_value = { WHITESPACE* ~ (!PEEK ~ ANY)* ~ WHITESPACE* }
attr_non_quoted = @{ !quote ~ (!(WSP | chevron_right) ~ ANY)* }
attr_key = { WSP* ~ ASCII_ALPHA ~ text_chars* ~ WSP* }
attr_value = { WSP* ~ (!PEEK ~ ANY)* ~ WSP* }

//
// ELEMENTS
Expand Down Expand Up @@ -79,15 +79,15 @@ el_void_name_svg = @{
| ^"circle"
}
el_void_name = @{ el_void_name_html | el_void_name_svg }
el_void = _{ chevron_left_normal ~ el_void_name ~ attr* ~ (chevron_right_normal | chevron_right_closed) }
el_void_xml = _{ chevron_left_normal ~ el_name ~ attr* ~ chevron_right_closed }
el_void = _{ chevron_left_normal ~ WSP* ~ el_void_name ~ WSP* ~ attr* ~ WSP* ~ (chevron_right_normal | chevron_right_closed) }
el_void_xml = _{ chevron_left_normal ~ WSP* ~ el_name ~ WSP* ~ attr* ~ WSP* ~ chevron_right_closed }

// Open elements are default element that can take children
// and have both a start tag and an end tag
// Ex: <html lang="en"></html>
el_normal = _{ el_normal_start ~ (!el_normal_end ~ node)* ~ el_normal_end }
el_normal_start = _{ chevron_left_normal ~ PUSH(el_name) ~ attr* ~ chevron_right_normal}
el_normal_end = { chevron_left_closed ~ POP ~ chevron_right_normal}
el_normal_start = _{ chevron_left_normal ~ WSP* ~ PUSH(el_name) ~ WSP* ~ attr* ~ WSP* ~ chevron_right_normal}
el_normal_end = { chevron_left_closed ~ WSP* ~ POP ~ WSP* ~ chevron_right_normal}

// Raw text elements are elements with text/script content that
// might interfere with the normal html syntax
Expand All @@ -99,16 +99,16 @@ el_raw_text_name = {
}
el_raw_text_content = { (!el_raw_text_end ~ ANY)* }
el_raw_text = _{ el_raw_text_start ~ el_raw_text_content ~ el_raw_text_end }
el_raw_text_start = _{ chevron_left_normal ~ PUSH(el_raw_text_name) ~ attr* ~ chevron_right_normal}
el_raw_text_end = { chevron_left_closed ~ POP ~ chevron_right_normal}
el_raw_text_start = _{ chevron_left_normal ~ WSP* ~ PUSH(el_raw_text_name) ~ WSP* ~ attr* ~ WSP* ~ chevron_right_normal ~ WSP*}
el_raw_text_end = { WSP* ~ chevron_left_closed ~ WSP* ~ POP ~ WSP* ~ chevron_right_normal}

// XML processing instruction
// Ex: <?xml version="1.0" ?>
el_process_instruct = { chevron_left_question ~ el_name? ~ attr* ~ chevron_right_question }
el_process_instruct = { chevron_left_question ~ WSP* ~ el_name? ~ WSP* ~ attr* ~ WSP* ~ chevron_right_question }

// Catch dangling elements
// Ex: <div/></div>
el_dangling = { chevron_left_closed ~ el_name ~ chevron_right_normal}
el_dangling = { chevron_left_closed ~ WSP* ~ el_name ~ WSP* ~ chevron_right_normal}

//
// SYMBOLS / CHARACTERS
Expand All @@ -133,4 +133,4 @@ equal = _{ "=" }
quote_dubble = _{ "\"" }
quote_single = _{ "'" }
quote = _{ quote_dubble | quote_single }
WHITESPACE = _{ " " | "\t" | "\r" | "\n" }
WSP = _{ " " | "\t" | "\r" | "\n" }
27 changes: 27 additions & 0 deletions tests/element.rs
Original file line number Diff line number Diff line change
Expand Up @@ -222,3 +222,30 @@ fn it_can_clone_dom() {
let dom_clone = dom.clone();
assert_eq!(dom, dom_clone);
}

#[test]
fn it_can_deal_with_weird_whitespaces() {
let html = indoc!(
"
<!-- Normal case -->
<div> Text </div>

<!-- Whitespaces in opening tag to the left -->
< div> Text </div>

<!-- Whitespaces in opening tag to the right -->
<div > Text </div>

<!-- Whitespaces in closing tag to the left (should not work) -->
<div> Text < /div>
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cases where </ is separated by a whitespace like < / do not work in browsers as well, so I just didn't make them work either


<!-- Whitespaces in closing tag to the right -->
<div> Text </div >

<!-- Whitespaces everywhere (should not work) -->
< div > Text < / div >
"
);
let dom = Dom::parse(html).unwrap();
assert_json_snapshot!(dom);
}
7 changes: 7 additions & 0 deletions tests/element_attributes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ fn it_can_parse_multiple_attributes_single_quote() -> Result<()> {
Ok(())
}
#[test]
fn it_can_parse_multiple_attributes_where_whitespace_does_not_matter_for_keys() -> Result<()> {
let html = "<div cat = \"mjau\" dog =' woff 'ape = oh ></div>";
let dom = Dom::parse(html)?;
assert_json_snapshot!(dom);
Ok(())
}
#[test]
fn it_can_parse_multiple_attributes_double_quote() -> Result<()> {
let html = "<div cat=\"mjau\" dog=\"woff\" ape=\"oh\"></div>";
let dom = Dom::parse(html)?;
Expand Down
45 changes: 45 additions & 0 deletions tests/snapshots/element__it_can_deal_with_weird_whitespaces.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
---
source: tests/element.rs
expression: dom
---
{
"treeType": "documentFragment",
"children": [
"Normal case",
{
"name": "div",
"variant": "normal",
"children": [
" Text "
]
},
"Whitespaces in opening tag to the left",
{
"name": "div",
"variant": "normal",
"children": [
" Text "
]
},
"Whitespaces in opening tag to the right",
{
"name": "div",
"variant": "normal",
"children": [
" Text "
]
},
"Whitespaces in closing tag to the left (should not work)",
"<div> Text < /div>\n\n",
"Whitespaces in closing tag to the right",
{
"name": "div",
"variant": "normal",
"children": [
" Text "
]
},
"Whitespaces everywhere (should not work)",
"< div > Text < / div >\n"
]
}
2 changes: 1 addition & 1 deletion tests/snapshots/element__it_can_parse_deeply_nested.snap
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ expression: dom
],
"children": [
"this is deep",
"hello world"
"hello world\n "
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are some cases where we get this ugly output. But this seems to be the same thing browsers do when you try to edit text nodes where they also seem to include line breaks and stuff.

Altogether I think this is the better behavior as you can just trim the result if you don't need the whitespaces.

]
}
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ expression: dom
"name": "div",
"variant": "void"
},
"Hello",
"\n Hello\n ",
{
"name": "div",
"variant": "normal",
"children": [
"World"
"\n World\n "
]
}
]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
---
source: tests/element_attributes.rs
expression: dom
---
{
"treeType": "documentFragment",
"children": [
{
"name": "div",
"variant": "normal",
"attributes": {
"ape": "oh",
"cat": "mjau",
"dog": " woff "
}
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ expression: dom
{
"treeType": "documentFragment",
"children": [
"hello world\nhere's another line for you!",
"hello world\nhere's another line for you!\n",
{
"name": "div",
"variant": "void"
},
"The end"
"\nThe end\n"
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ expression: dom
{
"treeType": "documentFragment",
"children": [
"hello world\nhere's another line for you!\nThe end"
"hello world\nhere's another line for you!\nThe end\n"
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
---
source: tests/text.rs
expression: dom
---
{
"treeType": "documentFragment",
"children": [
{
"name": "p",
"variant": "normal",
"children": [
"\n This is a ",
{
"name": "b",
"variant": "normal",
"children": [
"para"
]
},
"gra",
{
"name": "b",
"variant": "normal",
"children": [
"ph"
]
},
" with some",
{
"name": "i",
"variant": "normal",
"children": [
" weird "
]
},
" formatting.\n"
]
}
]
}
12 changes: 12 additions & 0 deletions tests/text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,15 @@ fn it_can_parse_text_with_chevron() -> Result<()> {
assert_json_snapshot!(dom);
Ok(())
}

#[test]
fn it_can_parse_text_in_paragraph_with_weird_formatting() -> Result<()> {
let html = indoc!(r"
<p>
This is a <b>para</b>gra<b>ph</b> with some<i> weird </i> formatting.
</p>
");
let dom = Dom::parse(html)?;
assert_json_snapshot!(dom);
Ok(())
}