mathiversen · mathiversen · May 11, 2023 · Jul 18, 2022 · bennyboer · Jul 18, 2022
diff --git a/src/dom/mod.rs b/src/dom/mod.rs
@@ -128,7 +128,10 @@ impl Dom {
                     if dom.tree_type == DomVariant::Empty {
                         dom.tree_type = DomVariant::DocumentFragment;
                     }
-                    dom.children.push(Node::Text(pair.as_str().to_string()));
+                    let text = pair.as_str().to_string();
+                    if !text.trim().is_empty() {
+                        dom.children.push(Node::Text(text));
+                    }
                 }
 
                 // Store comments as a child, but it doesn't affect the document type selection
@@ -244,7 +247,10 @@ impl Dom {
                     }
                 }
                 Rule::node_text | Rule::el_raw_text_content => {
-                    element.children.push(Node::Text(pair.as_str().to_string()));
+                    let text = pair.as_str().to_string();
+                    if !text.trim().is_empty() {
+                        element.children.push(Node::Text(text));
+                    }
                 }
                 Rule::node_comment => {
                     element
@@ -304,10 +310,10 @@ impl Dom {
         for pair in pairs {
             match pair.as_rule() {
                 Rule::attr_key => {
-                    attribute.0 = pair.as_str().to_string();
+                    attribute.0 = pair.as_str().trim().to_string();
                 }
                 Rule::attr_non_quoted => {
-                    attribute.1 = Some(pair.as_str().to_string());
+                    attribute.1 = Some(pair.as_str().trim().to_string());
                 }
                 Rule::attr_quoted => {
                     let inner_pair = pair

diff --git a/src/grammar/rules.pest b/src/grammar/rules.pest
@@ -12,23 +12,23 @@ html = _{
 //
 // DOCTYPE
 //
-doctype = { chevron_left_bang ~ ^"doctype" ~ attr* ~ chevron_right_normal}
+doctype = { WSP* ~ chevron_left_bang ~ ^"doctype" ~ WSP* ~ attr* ~ WSP* ~ chevron_right_normal}
 
 //
 // NODES
 //
 node = _{ node_comment | node_element | node_text }
-node_comment = { comment_if | comment_normal }
-node_text = { (!(node_element | comment_tag_start) ~ ANY)+ }
+node_comment = { WSP* ~ (comment_if | comment_normal) ~ WSP* }
+node_text = { (!(node_element | comment_tag_start | chevron_left_bang) ~ ANY)+ }
 node_element = { el_void | el_void_xml | el_process_instruct | el_raw_text | el_normal | el_dangling }
 
 //
 // COMMENTS
 //
 comment_normal = _{ comment_tag_start ~ comment_body ~ comment_tag_end }
 comment_body = { (!comment_tag_end ~ ANY)* }
-comment_tag_start = _{ chevron_left_bang ~ "--" }
-comment_tag_end = _{ "--" ~ chevron_right_normal }
+comment_tag_start = _{ chevron_left_bang ~ "--" ~ WSP* }
+comment_tag_end = _{ WSP* ~ "--" ~ chevron_right_normal }
 
 // Compatability with old IE browsers... This is not necessary for newer browsers
 comment_if = _{ comment_if_start ~ comment_if_body ~ comment_if_end }
@@ -39,11 +39,11 @@ comment_if_end = _{ chevron_left_bang ~ "[" ~ ^"endif" ~ "]" ~ comment_tag_end }
 //
 // ATTRIBUTES
 //
-attr = { attr_key ~ (equal ~ (attr_non_quoted | attr_quoted ))? }
+attr = { attr_key ~ (equal ~ WSP* ~ (attr_non_quoted | attr_quoted ))? }
 attr_quoted =  ${PUSH(quote) ~ attr_value ~ POP }
-attr_non_quoted = @{ !quote ~ (!(WHITESPACE | chevron_right) ~ ANY)* }
-attr_key = { ASCII_ALPHA ~ text_chars* }
-attr_value = { WHITESPACE* ~ (!PEEK ~ ANY)* ~ WHITESPACE* }
+attr_non_quoted = @{ !quote ~ (!(WSP | chevron_right) ~ ANY)* }
+attr_key = { WSP* ~ ASCII_ALPHA ~ text_chars* ~ WSP* }
+attr_value = { WSP* ~ (!PEEK ~ ANY)* ~ WSP* }
 
 //
 // ELEMENTS
@@ -79,15 +79,15 @@ el_void_name_svg = @{
     | ^"circle"
 }
 el_void_name = @{ el_void_name_html | el_void_name_svg }
-el_void = _{ chevron_left_normal ~ el_void_name ~ attr* ~ (chevron_right_normal | chevron_right_closed) }
-el_void_xml = _{ chevron_left_normal ~ el_name ~ attr* ~ chevron_right_closed }
+el_void = _{ chevron_left_normal ~ WSP* ~ el_void_name ~ WSP* ~ attr* ~ WSP* ~ (chevron_right_normal | chevron_right_closed) }
+el_void_xml = _{ chevron_left_normal ~ WSP* ~ el_name ~ WSP* ~ attr* ~ WSP* ~ chevron_right_closed }
 
 // Open elements are default element that can take children 
 // and have both a start tag and an end tag
 // Ex: <html lang="en"></html>
 el_normal = _{ el_normal_start ~ (!el_normal_end ~ node)* ~ el_normal_end }
-el_normal_start = _{ chevron_left_normal ~ PUSH(el_name) ~ attr* ~ chevron_right_normal}
-el_normal_end = { chevron_left_closed ~ POP ~ chevron_right_normal}
+el_normal_start = _{ chevron_left_normal ~ WSP* ~ PUSH(el_name) ~ WSP* ~ attr* ~ WSP* ~ chevron_right_normal}
+el_normal_end = { chevron_left_closed ~ WSP* ~ POP ~ WSP* ~ chevron_right_normal}
 
 // Raw text elements are elements with text/script content that
 // might interfere with the normal html syntax
@@ -99,16 +99,16 @@ el_raw_text_name = {
 }
 el_raw_text_content = { (!el_raw_text_end ~ ANY)* }
 el_raw_text = _{ el_raw_text_start ~ el_raw_text_content ~ el_raw_text_end }
-el_raw_text_start = _{ chevron_left_normal ~ PUSH(el_raw_text_name) ~ attr* ~ chevron_right_normal}
-el_raw_text_end = { chevron_left_closed ~ POP ~ chevron_right_normal}
+el_raw_text_start = _{ chevron_left_normal ~ WSP* ~ PUSH(el_raw_text_name) ~ WSP* ~ attr* ~ WSP* ~ chevron_right_normal ~ WSP*}
+el_raw_text_end = { WSP* ~ chevron_left_closed ~ WSP* ~ POP ~ WSP* ~ chevron_right_normal}
 
 // XML processing instruction
 // Ex: <?xml version="1.0" ?>
-el_process_instruct = { chevron_left_question ~ el_name? ~ attr* ~ chevron_right_question }
+el_process_instruct = { chevron_left_question ~ WSP* ~ el_name? ~ WSP* ~ attr* ~ WSP* ~ chevron_right_question }
 
 // Catch dangling elements
 // Ex: <div/></div>
-el_dangling = { chevron_left_closed ~ el_name ~ chevron_right_normal}
+el_dangling = { chevron_left_closed ~ WSP* ~ el_name ~ WSP* ~ chevron_right_normal}
 
 //
 // SYMBOLS / CHARACTERS
@@ -133,4 +133,4 @@ equal = _{ "=" }
 quote_dubble = _{ "\"" }
 quote_single = _{ "'" }
 quote = _{ quote_dubble | quote_single }
-WHITESPACE = _{ " " | "\t" | "\r" | "\n" }
+WSP = _{ " " | "\t" | "\r" | "\n" }
diff --git a/tests/element.rs b/tests/element.rs
@@ -222,3 +222,30 @@ fn it_can_clone_dom() {
     let dom_clone = dom.clone();
     assert_eq!(dom, dom_clone);
 }
+
+#[test]
+fn it_can_deal_with_weird_whitespaces() {
+    let html = indoc!(
+        "
+        <!-- Normal case -->
+        <div> Text </div>
+
+        <!-- Whitespaces in opening tag to the left -->
+        < div> Text </div>
+
+        <!-- Whitespaces in opening tag to the right -->
+        <div > Text </div>
+
+        <!-- Whitespaces in closing tag to the left (should not work) -->
+        <div> Text < /div>
+
+        <!-- Whitespaces in closing tag to the right -->
+        <div> Text </div >
+
+        <!-- Whitespaces everywhere (should not work) -->
+        < div > Text < / div >
+        "
+    );
+    let dom = Dom::parse(html).unwrap();
+    assert_json_snapshot!(dom);
+}
diff --git a/tests/element_attributes.rs b/tests/element_attributes.rs
@@ -37,6 +37,13 @@ fn it_can_parse_multiple_attributes_single_quote() -> Result<()> {
     Ok(())
 }
 #[test]
+fn it_can_parse_multiple_attributes_where_whitespace_does_not_matter_for_keys() -> Result<()> {
+    let html = "<div    cat   =  \"mjau\" dog ='  woff  'ape = oh ></div>";
+    let dom = Dom::parse(html)?;
+    assert_json_snapshot!(dom);
+    Ok(())
+}
+#[test]
 fn it_can_parse_multiple_attributes_double_quote() -> Result<()> {
     let html = "<div cat=\"mjau\" dog=\"woff\" ape=\"oh\"></div>";
     let dom = Dom::parse(html)?;

diff --git a/tests/snapshots/element__it_can_deal_with_weird_whitespaces.snap b/tests/snapshots/element__it_can_deal_with_weird_whitespaces.snap
@@ -0,0 +1,45 @@
+---
+source: tests/element.rs
+expression: dom
+---
+{
+  "treeType": "documentFragment",
+  "children": [
+    "Normal case",
+    {
+      "name": "div",
+      "variant": "normal",
+      "children": [
+        " Text "
+      ]
+    },
+    "Whitespaces in opening tag to the left",
+    {
+      "name": "div",
+      "variant": "normal",
+      "children": [
+        " Text "
+      ]
+    },
+    "Whitespaces in opening tag to the right",
+    {
+      "name": "div",
+      "variant": "normal",
+      "children": [
+        " Text "
+      ]
+    },
+    "Whitespaces in closing tag to the left (should not work)",
+    "<div> Text < /div>\n\n",
+    "Whitespaces in closing tag to the right",
+    {
+      "name": "div",
+      "variant": "normal",
+      "children": [
+        " Text "
+      ]
+    },
+    "Whitespaces everywhere (should not work)",
+    "< div > Text < / div >\n"
+  ]
+}
diff --git a/tests/snapshots/element__it_can_parse_deeply_nested.snap b/tests/snapshots/element__it_can_parse_deeply_nested.snap
@@ -62,7 +62,7 @@ expression: dom
                                   ],
                                   "children": [
                                     "this is deep",
-                                    "hello world"
+                                    "hello world\n                            "
                                   ]
                                 }
                               ]

diff --git a/tests/snapshots/element__it_can_parse_nested_elements_mixed_children.snap b/tests/snapshots/element__it_can_parse_nested_elements_mixed_children.snap
@@ -14,12 +14,12 @@ expression: dom
           "name": "div",
           "variant": "void"
         },
-        "Hello",
+        "\n    Hello\n    ",
         {
           "name": "div",
           "variant": "normal",
           "children": [
-            "World"
+            "\n        World\n    "
           ]
         }
       ]

diff --git a/tests/snapshots/element_attributes__it_can_parse_multiple_attributes_where_whitespace_does_not_matter_for_keys.snap b/tests/snapshots/element_attributes__it_can_parse_multiple_attributes_where_whitespace_does_not_matter_for_keys.snap
@@ -0,0 +1,18 @@
+---
+source: tests/element_attributes.rs
+expression: dom
+---
+{
+  "treeType": "documentFragment",
+  "children": [
+    {
+      "name": "div",
+      "variant": "normal",
+      "attributes": {
+        "ape": "oh",
+        "cat": "mjau",
+        "dog": "  woff  "
+      }
+    }
+  ]
+}
diff --git a/tests/snapshots/text__it_can_parse_document_with_multiple_text_elements.snap b/tests/snapshots/text__it_can_parse_document_with_multiple_text_elements.snap
@@ -5,11 +5,11 @@ expression: dom
 {
   "treeType": "documentFragment",
   "children": [
-    "hello world\nhere's another line for you!",
+    "hello world\nhere's another line for you!\n",
     {
       "name": "div",
       "variant": "void"
     },
-    "The end"
+    "\nThe end\n"
   ]
 }
diff --git a/tests/snapshots/text__it_can_parse_document_with_text_and_line_breaks.snap b/tests/snapshots/text__it_can_parse_document_with_text_and_line_breaks.snap
@@ -5,6 +5,6 @@ expression: dom
 {
   "treeType": "documentFragment",
   "children": [
-    "hello world\nhere's another line for you!\nThe end"
+    "hello world\nhere's another line for you!\nThe end\n"
   ]
 }
diff --git a/tests/snapshots/text__it_can_parse_text_in_paragraph_with_weird_formatting.snap b/tests/snapshots/text__it_can_parse_text_in_paragraph_with_weird_formatting.snap
@@ -0,0 +1,40 @@
+---
+source: tests/text.rs
+expression: dom
+---
+{
+  "treeType": "documentFragment",
+  "children": [
+    {
+      "name": "p",
+      "variant": "normal",
+      "children": [
+        "\n    This is a ",
+        {
+          "name": "b",
+          "variant": "normal",
+          "children": [
+            "para"
+          ]
+        },
+        "gra",
+        {
+          "name": "b",
+          "variant": "normal",
+          "children": [
+            "ph"
+          ]
+        },
+        " with some",
+        {
+          "name": "i",
+          "variant": "normal",
+          "children": [
+            " weird "
+          ]
+        },
+        " formatting.\n"
+      ]
+    }
+  ]
+}
diff --git a/tests/text.rs b/tests/text.rs
@@ -46,3 +46,15 @@ fn it_can_parse_text_with_chevron() -> Result<()> {
     assert_json_snapshot!(dom);
     Ok(())
 }
+
+#[test]
+fn it_can_parse_text_in_paragraph_with_weird_formatting() -> Result<()> {
+    let html = indoc!(r"
+        <p>
+            This is a <b>para</b>gra<b>ph</b> with some<i> weird </i> formatting.
+        </p>
+    ");
+    let dom = Dom::parse(html)?;
+    assert_json_snapshot!(dom);
+    Ok(())
+}