fix: add markdown sanitization for malicious notebooks (#1315)

* fix: add markdown sanitization for malicious notebooks * *sufficient * fix: check for code block by regex
marimo-team · May 5, 2024 · e1d5b14 · e1d5b14
1 parent ddc0c52
commit e1d5b14
Show file tree

Hide file tree

Showing 8 changed files with 566 additions and 51 deletions.
diff --git a/marimo/_cli/convert/markdown.py b/marimo/_cli/convert/markdown.py
@@ -19,7 +19,10 @@
 from markdown.util import HTML_PLACEHOLDER_RE, Registry
 
 # As are extensions
-from pymdownx.superfences import SuperFencesCodeExtension  # type: ignore
+from pymdownx.superfences import (  # type: ignore
+    RE_NESTED_FENCE_START,
+    SuperFencesCodeExtension,
+)
 
 from marimo._ast.app import _AppConfig
 from marimo._cli.convert.utils import generate_from_sources, markdown_to_marimo
@@ -39,6 +42,13 @@ def _is_code_tag(text: str) -> bool:
     return bool(re.search(r"\{.*python.*\}", head))
 
 
+def formatted_code_block(code: str) -> str:
+    guard = "```"
+    while guard in code:
+        guard += "`"
+    return "\n".join([f"""{guard}{{.python.marimo}}""", code, guard, ""])
+
+
 def _tree_to_app(root: Element) -> str:
     # Extract meta data from root attributes.
     config_keys = {"title": "app_title"}
@@ -63,25 +73,21 @@ def _tree_to_app(root: Element) -> str:
     return generate_from_sources(sources, app_config)
 
 
-class MarimoParser(Markdown):
-    """Parses Markdown to marimo notebook."""
+class IdentityParser(Markdown):
+    """Leaves markdown unchanged."""
 
     # Considering how ubiquitous "markdown" is, it's a little surprising the
     # internal structure isn't cleaner/ more modular. This "monkey-patching"
     # is comparable to some of the code in markdown extensions- and given this
     # library has been around since 2004, the internals should be relatively
     # stable.
-    output_formats: dict[Literal["marimo"], Callable[[Element], str]] = {  # type: ignore[assignment, misc]
-        "marimo": _tree_to_app,
+    output_formats: dict[Literal["identity"], Callable[[Element], str]] = {  # type: ignore[assignment, misc]
+        "identity": lambda x: x.text if x.text else "",
     }
-    meta: dict[str, Any]
 
-    def build_parser(self) -> MarimoParser:
+    def build_parser(self) -> IdentityParser:
         """
         Creates blank registries as a base.
-
-        Note that evoked by itself, will create an infinite loop, since
-        block-parsers will never dequeue the extracted blocks.
         """
         self.preprocessors = Registry()
         self.parser = BlockParser(self)
@@ -90,6 +96,76 @@ def build_parser(self) -> MarimoParser:
         self.postprocessors = Registry()
         return self
 
+    def convert(self, text: str) -> str:
+        """Override the convert method to return the parsed text.
+
+        Note that evoked by itself, would create an infinite loop, since
+        block-parsers will never dequeue the extracted blocks.
+        """
+        if len(self.parser.blockprocessors) == 0:
+            self.parser.blockprocessors.register(
+                IdentityProcessor(self.parser), "identity", 1
+            )
+
+        return super().convert(text)
+
+
+class MarimoParser(IdentityParser):
+    """Parses Markdown to marimo notebook string."""
+
+    meta: dict[str, Any]
+
+    output_formats: dict[Literal["marimo"], Callable[[Element], str]] = {  # type: ignore[assignment, misc]
+        "marimo": _tree_to_app,
+    }
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        # Build here opposed to the parent class since there is intermediate
+        # logic after the parser is built, and it is more clear here what is
+        # registered.
+        self.stripTopLevelTags = False
+
+        # Note: MetaPreprocessor does not properly handle frontmatter yaml, so
+        # cleanup occurs in the block-processor.
+        self.preprocessors.register(
+            FrontMatterPreprocessor(self), "frontmatter", 100
+        )
+        fences_ext = SuperFencesCodeExtension()
+        fences_ext.extendMarkdown(self)
+        # TODO: Consider adding the admonition extension, and integrating it
+        # with mo.markdown callouts.
+
+        block_processor = ExpandAndClassifyProcessor(self.parser)
+        block_processor.stash = fences_ext.stash.stash
+        self.parser.blockprocessors.register(
+            block_processor, "marimo-processor", 10
+        )
+
+
+class SanitizeParser(IdentityParser):
+    """Sanitizes Markdown to non-executable string."""
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        # Potentially no need for a separate sanitizer. We could use a
+        # heuristic to determine if this block should be treated as code, but
+        # to catch all edgecases, just run it through the similar superfence
+        # logic.
+        self.stripTopLevelTags = False
+
+        fences_ext = SuperFencesCodeExtension()
+        fences_ext.extendMarkdown(self)
+
+        preprocessor = SanitizeProcessor(self)
+        preprocessor.stash = fences_ext.stash.stash
+        self.preprocessors.register(preprocessor, "marimo-sanitizer", 1)
+
+        # Add back in identity to dequeue.
+        self.parser.blockprocessors.register(
+            IdentityProcessor(self.parser), "identity", 1
+        )
+
 
 class FrontMatterPreprocessor(Preprocessor):
     """Preprocessor for to extract YAML front matter.
@@ -137,6 +213,55 @@ def run(self, lines: list[str]) -> list[str]:
         return doc.split("\n")
 
 
+class SanitizeProcessor(Preprocessor):
+    """Prevent unintended executable code block injection.
+
+    Typically run on Markdown fragments (e.g. cells) to prevent code injection.
+    **Note***: Must run after SuperFencesCodeExtension.
+    """
+
+    stash: dict[str, Any]
+
+    def run(self, lines: list[str]) -> list[str]:
+        # Note, an empty stash is not sufficient since partially open code
+        # blocks could be in the text.
+        if not lines:
+            return lines
+
+        is_code = False
+        for i, line in enumerate(lines):
+            # Still need to do all replacements
+            if HTML_PLACEHOLDER_RE.match(line.strip()):
+                lookup = line.strip()[1:-1]
+                code = self.stash[lookup][0]
+                lines[i] = code
+                # This is a tag we would normally parse on.
+                # So protect it from being parsed improperly, by just treating
+                # it as code.
+                is_code = is_code or _is_code_tag(code)
+            # We also need to check for code block delimiters that superfences
+            # did not catch, as this will break other code blocks.
+            is_code = is_code or RE_NESTED_FENCE_START.match(line)
+
+        if not is_code:
+            return lines
+
+        return formatted_code_block(
+            markdown_to_marimo("\n".join(lines))
+        ).split("\n")
+
+
+class IdentityProcessor(BlockProcessor):
+    """Leaves markdown unchanged."""
+
+    def test(*_args: Any) -> bool:
+        return True
+
+    def run(self, parent: Element, blocks: list[str]) -> None:
+        parent.text = "\n\n".join(blocks)
+        blocks.clear()
+
+
 class ExpandAndClassifyProcessor(BlockProcessor):
     """Separates code blocks and markdown blocks."""
 
@@ -188,31 +313,15 @@ def add_paragraph() -> None:
         blocks.clear()
 
 
-def _build_marimo_parser() -> MarimoParser:
-    # Build here opposed to the parent class since there is intermediate logic
-    # after the parser is built, and it is more clear here what is registered.
-    md = MarimoParser(output_format="marimo")  # type: ignore[arg-type]
-    md.stripTopLevelTags = False
-
-    # Note: MetaPreprocessor does not properly handle frontmatter yaml, so
-    # cleanup occurs in the block-processor.
-    md.preprocessors.register(FrontMatterPreprocessor(md), "frontmatter", 100)
-    fences_ext = SuperFencesCodeExtension()
-    fences_ext.extendMarkdown(md)
-    # TODO: Consider adding the admonition extension, and integrating it with
-    # mo.markdown callouts.
+def convert_from_md(text: str) -> str:
+    return MarimoParser(output_format="marimo").convert(text)  # type: ignore[arg-type]
 
-    block_processors = ExpandAndClassifyProcessor(md.parser)
-    block_processors.stash = fences_ext.stash.stash
-    md.parser.blockprocessors.register(
-        block_processors, "marimo-processor", 10
-    )
 
-    return md
+def sanitize_markdown(text: str) -> str:
+    return SanitizeParser(output_format="identity").convert(text)  # type: ignore[arg-type]
 
 
-def convert_from_md(text: str) -> str:
-    if not text:
-        raise ValueError("No content found in markdown.")
-    md = _build_marimo_parser()
-    return md.convert(text)
+def is_sanitized_markdown(text: str) -> bool:
+    # "Unsanitized" markdown contains potentially unintended executatable code
+    # block, which require backticks.
+    return "```" not in text or sanitize_markdown(text) == text
diff --git a/marimo/_server/export/exporter.py b/marimo/_server/export/exporter.py
@@ -7,6 +7,10 @@
 from typing import cast
 
 from marimo import __version__
+from marimo._cli.convert.markdown import (
+    formatted_code_block,
+    is_sanitized_markdown,
+)
 from marimo._config.config import (
     DEFAULT_CONFIG,
     DisplayConfig,
@@ -148,20 +152,16 @@ def export_as_md(self, file_manager: AppFileManager) -> tuple[str, str]:
             code = cell_data.code
             if cell:
                 markdown = get_markdown_from_cell(cell, code)
-                if markdown:
+                # Unsanitized markdown is forced to code.
+                if markdown and is_sanitized_markdown(markdown):
                     previous_was_markdown = True
                     document.append(markdown)
-                else:
-                    # Add a blank line between markdown and code
-                    if previous_was_markdown:
-                        document.append("")
-                    previous_was_markdown = False
-                    guard = "```"
-                    while guard in code:
-                        guard += "`"
-                    document.extend(
-                        [f"""{guard}{{.python.marimo}}""", code, guard, ""]
-                    )
+                    continue
+                # Add a blank line between markdown and code
+                if previous_was_markdown:
+                    document.append("")
+                previous_was_markdown = False
+                document.append(formatted_code_block(code))
 
         download_filename = get_download_filename(file_manager, ".md")
         return "\n".join(document).strip(), download_filename
diff --git a/marimo/_server/export/utils.py b/marimo/_server/export/utils.py
@@ -81,17 +81,18 @@ def get_markdown_from_cell(
         else:
             return None
         assert value.func.value.id == "mo"
+        md_lines = _const_string(value.args).split("\n")
     except (AssertionError, AttributeError, ValueError):
         # No reason to explicitly catch exceptions if we can't parse out
         # markdown. Just handle it as a code block.
         return None
 
     # Dedent behavior is a little different that in marimo js, so handle
     # accordingly.
-    md_lines = _const_string(value.args).split("\n")
     md_lines = [line.rstrip() for line in md_lines]
     md = dedent(md_lines[0]) + "\n" + dedent("\n".join(md_lines[1:]))
     md = md.strip()
+
     if callout:
         md = dedent(
             f"""

diff --git a/tests/_cli/snapshots/unsafe-app.md.txt b/tests/_cli/snapshots/unsafe-app.md.txt
@@ -0,0 +1,68 @@
+---
+title: Test Notebook
+marimo-version: 0.0.0
+---
+
+```{.python.marimo}
+import marimo as mo
+```
+
+````{.python.marimo}
+mo.md("""
+    # Code blocks in code blocks
+    Output code for Hello World!
+    ```python
+    print("Hello World")
+    ```
+    Execute print
+    ```{python}
+    print("Hello World")
+    ```
+""")
+````
+
+````{.python.marimo}
+mo.md(f"""
+    with f-string too!
+    ```{{python}}
+    print("Hello World")
+    ```
+""")
+````
+
+````{.python.marimo}
+mo.md(f"""
+    Not markdown
+    ```{{python}}
+    print("1 + 1 = {1 + 1}")
+    ```
+""")
+````
+
+Nested fence
+````text
+The guards are
+```{python}
+````
+
+````{.python.marimo}
+"""
+```
+"""
+````
+
+````{.python.marimo}
+mo.md("""
+    Cross cell injection
+    ```python
+""")
+````
+
+```{.python.marimo}
+1 + 1
+```
+
+```{.python.marimo}
+# Actual print
+print("Hello World")
+```