Skip to content

Commit

Permalink
fix: add markdown sanitization for malicious notebooks (#1315)
Browse files Browse the repository at this point in the history
* fix: add markdown sanitization for malicious notebooks

* *sufficient

* fix: check for code block by regex
  • Loading branch information
dmadisetti authored and mscolnick committed May 5, 2024
1 parent ddc0c52 commit e1d5b14
Show file tree
Hide file tree
Showing 8 changed files with 566 additions and 51 deletions.
177 changes: 143 additions & 34 deletions marimo/_cli/convert/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@
from markdown.util import HTML_PLACEHOLDER_RE, Registry

# As are extensions
from pymdownx.superfences import SuperFencesCodeExtension # type: ignore
from pymdownx.superfences import ( # type: ignore
RE_NESTED_FENCE_START,
SuperFencesCodeExtension,
)

from marimo._ast.app import _AppConfig
from marimo._cli.convert.utils import generate_from_sources, markdown_to_marimo
Expand All @@ -39,6 +42,13 @@ def _is_code_tag(text: str) -> bool:
return bool(re.search(r"\{.*python.*\}", head))


def formatted_code_block(code: str) -> str:
guard = "```"
while guard in code:
guard += "`"
return "\n".join([f"""{guard}{{.python.marimo}}""", code, guard, ""])


def _tree_to_app(root: Element) -> str:
# Extract meta data from root attributes.
config_keys = {"title": "app_title"}
Expand All @@ -63,25 +73,21 @@ def _tree_to_app(root: Element) -> str:
return generate_from_sources(sources, app_config)


class MarimoParser(Markdown):
"""Parses Markdown to marimo notebook."""
class IdentityParser(Markdown):
"""Leaves markdown unchanged."""

# Considering how ubiquitous "markdown" is, it's a little surprising the
# internal structure isn't cleaner/ more modular. This "monkey-patching"
# is comparable to some of the code in markdown extensions- and given this
# library has been around since 2004, the internals should be relatively
# stable.
output_formats: dict[Literal["marimo"], Callable[[Element], str]] = { # type: ignore[assignment, misc]
"marimo": _tree_to_app,
output_formats: dict[Literal["identity"], Callable[[Element], str]] = { # type: ignore[assignment, misc]
"identity": lambda x: x.text if x.text else "",
}
meta: dict[str, Any]

def build_parser(self) -> MarimoParser:
def build_parser(self) -> IdentityParser:
"""
Creates blank registries as a base.
Note that evoked by itself, will create an infinite loop, since
block-parsers will never dequeue the extracted blocks.
"""
self.preprocessors = Registry()
self.parser = BlockParser(self)
Expand All @@ -90,6 +96,76 @@ def build_parser(self) -> MarimoParser:
self.postprocessors = Registry()
return self

def convert(self, text: str) -> str:
"""Override the convert method to return the parsed text.
Note that evoked by itself, would create an infinite loop, since
block-parsers will never dequeue the extracted blocks.
"""
if len(self.parser.blockprocessors) == 0:
self.parser.blockprocessors.register(
IdentityProcessor(self.parser), "identity", 1
)

return super().convert(text)


class MarimoParser(IdentityParser):
"""Parses Markdown to marimo notebook string."""

meta: dict[str, Any]

output_formats: dict[Literal["marimo"], Callable[[Element], str]] = { # type: ignore[assignment, misc]
"marimo": _tree_to_app,
}

def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
# Build here opposed to the parent class since there is intermediate
# logic after the parser is built, and it is more clear here what is
# registered.
self.stripTopLevelTags = False

# Note: MetaPreprocessor does not properly handle frontmatter yaml, so
# cleanup occurs in the block-processor.
self.preprocessors.register(
FrontMatterPreprocessor(self), "frontmatter", 100
)
fences_ext = SuperFencesCodeExtension()
fences_ext.extendMarkdown(self)
# TODO: Consider adding the admonition extension, and integrating it
# with mo.markdown callouts.

block_processor = ExpandAndClassifyProcessor(self.parser)
block_processor.stash = fences_ext.stash.stash
self.parser.blockprocessors.register(
block_processor, "marimo-processor", 10
)


class SanitizeParser(IdentityParser):
"""Sanitizes Markdown to non-executable string."""

def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
# Potentially no need for a separate sanitizer. We could use a
# heuristic to determine if this block should be treated as code, but
# to catch all edgecases, just run it through the similar superfence
# logic.
self.stripTopLevelTags = False

fences_ext = SuperFencesCodeExtension()
fences_ext.extendMarkdown(self)

preprocessor = SanitizeProcessor(self)
preprocessor.stash = fences_ext.stash.stash
self.preprocessors.register(preprocessor, "marimo-sanitizer", 1)

# Add back in identity to dequeue.
self.parser.blockprocessors.register(
IdentityProcessor(self.parser), "identity", 1
)


class FrontMatterPreprocessor(Preprocessor):
"""Preprocessor for to extract YAML front matter.
Expand Down Expand Up @@ -137,6 +213,55 @@ def run(self, lines: list[str]) -> list[str]:
return doc.split("\n")


class SanitizeProcessor(Preprocessor):
"""Prevent unintended executable code block injection.
Typically run on Markdown fragments (e.g. cells) to prevent code injection.
**Note***: Must run after SuperFencesCodeExtension.
"""

stash: dict[str, Any]

def run(self, lines: list[str]) -> list[str]:
# Note, an empty stash is not sufficient since partially open code
# blocks could be in the text.
if not lines:
return lines

is_code = False
for i, line in enumerate(lines):
# Still need to do all replacements
if HTML_PLACEHOLDER_RE.match(line.strip()):
lookup = line.strip()[1:-1]
code = self.stash[lookup][0]
lines[i] = code
# This is a tag we would normally parse on.
# So protect it from being parsed improperly, by just treating
# it as code.
is_code = is_code or _is_code_tag(code)
# We also need to check for code block delimiters that superfences
# did not catch, as this will break other code blocks.
is_code = is_code or RE_NESTED_FENCE_START.match(line)

if not is_code:
return lines

return formatted_code_block(
markdown_to_marimo("\n".join(lines))
).split("\n")


class IdentityProcessor(BlockProcessor):
"""Leaves markdown unchanged."""

def test(*_args: Any) -> bool:
return True

def run(self, parent: Element, blocks: list[str]) -> None:
parent.text = "\n\n".join(blocks)
blocks.clear()


class ExpandAndClassifyProcessor(BlockProcessor):
"""Separates code blocks and markdown blocks."""

Expand Down Expand Up @@ -188,31 +313,15 @@ def add_paragraph() -> None:
blocks.clear()


def _build_marimo_parser() -> MarimoParser:
# Build here opposed to the parent class since there is intermediate logic
# after the parser is built, and it is more clear here what is registered.
md = MarimoParser(output_format="marimo") # type: ignore[arg-type]
md.stripTopLevelTags = False

# Note: MetaPreprocessor does not properly handle frontmatter yaml, so
# cleanup occurs in the block-processor.
md.preprocessors.register(FrontMatterPreprocessor(md), "frontmatter", 100)
fences_ext = SuperFencesCodeExtension()
fences_ext.extendMarkdown(md)
# TODO: Consider adding the admonition extension, and integrating it with
# mo.markdown callouts.
def convert_from_md(text: str) -> str:
return MarimoParser(output_format="marimo").convert(text) # type: ignore[arg-type]

block_processors = ExpandAndClassifyProcessor(md.parser)
block_processors.stash = fences_ext.stash.stash
md.parser.blockprocessors.register(
block_processors, "marimo-processor", 10
)

return md
def sanitize_markdown(text: str) -> str:
return SanitizeParser(output_format="identity").convert(text) # type: ignore[arg-type]


def convert_from_md(text: str) -> str:
if not text:
raise ValueError("No content found in markdown.")
md = _build_marimo_parser()
return md.convert(text)
def is_sanitized_markdown(text: str) -> bool:
# "Unsanitized" markdown contains potentially unintended executatable code
# block, which require backticks.
return "```" not in text or sanitize_markdown(text) == text
24 changes: 12 additions & 12 deletions marimo/_server/export/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
from typing import cast

from marimo import __version__
from marimo._cli.convert.markdown import (
formatted_code_block,
is_sanitized_markdown,
)
from marimo._config.config import (
DEFAULT_CONFIG,
DisplayConfig,
Expand Down Expand Up @@ -148,20 +152,16 @@ def export_as_md(self, file_manager: AppFileManager) -> tuple[str, str]:
code = cell_data.code
if cell:
markdown = get_markdown_from_cell(cell, code)
if markdown:
# Unsanitized markdown is forced to code.
if markdown and is_sanitized_markdown(markdown):
previous_was_markdown = True
document.append(markdown)
else:
# Add a blank line between markdown and code
if previous_was_markdown:
document.append("")
previous_was_markdown = False
guard = "```"
while guard in code:
guard += "`"
document.extend(
[f"""{guard}{{.python.marimo}}""", code, guard, ""]
)
continue
# Add a blank line between markdown and code
if previous_was_markdown:
document.append("")
previous_was_markdown = False
document.append(formatted_code_block(code))

download_filename = get_download_filename(file_manager, ".md")
return "\n".join(document).strip(), download_filename
3 changes: 2 additions & 1 deletion marimo/_server/export/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,17 +81,18 @@ def get_markdown_from_cell(
else:
return None
assert value.func.value.id == "mo"
md_lines = _const_string(value.args).split("\n")
except (AssertionError, AttributeError, ValueError):
# No reason to explicitly catch exceptions if we can't parse out
# markdown. Just handle it as a code block.
return None

# Dedent behavior is a little different that in marimo js, so handle
# accordingly.
md_lines = _const_string(value.args).split("\n")
md_lines = [line.rstrip() for line in md_lines]
md = dedent(md_lines[0]) + "\n" + dedent("\n".join(md_lines[1:]))
md = md.strip()

if callout:
md = dedent(
f"""
Expand Down
68 changes: 68 additions & 0 deletions tests/_cli/snapshots/unsafe-app.md.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
---
title: Test Notebook
marimo-version: 0.0.0
---

```{.python.marimo}
import marimo as mo
```

````{.python.marimo}
mo.md("""
# Code blocks in code blocks
Output code for Hello World!
```python
print("Hello World")
```
Execute print
```{python}
print("Hello World")
```
""")
````

````{.python.marimo}
mo.md(f"""
with f-string too!
```{{python}}
print("Hello World")
```
""")
````

````{.python.marimo}
mo.md(f"""
Not markdown
```{{python}}
print("1 + 1 = {1 + 1}")
```
""")
````

Nested fence
````text
The guards are
```{python}
````

````{.python.marimo}
"""
```
"""
````

````{.python.marimo}
mo.md("""
Cross cell injection
```python
""")
````

```{.python.marimo}
1 + 1
```

```{.python.marimo}
# Actual print
print("Hello World")
```
Loading

0 comments on commit e1d5b14

Please sign in to comment.