Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: convert and export from markdown #1296

Merged
merged 15 commits into from
May 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 2 additions & 24 deletions marimo/_cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import marimo._cli.cli_validators as validators
from marimo import __version__, _loggers
from marimo._ast import codegen
from marimo._cli import ipynb_to_marimo
from marimo._cli.convert.commands import convert
from marimo._cli.envinfo import get_system_info
from marimo._cli.export.commands import export
from marimo._cli.file_path import validate_name
Expand Down Expand Up @@ -502,29 +502,6 @@ def tutorial(
)


@main.command()
@click.argument("ipynb", type=str, required=True)
def convert(ipynb: str) -> None:
"""Convert a Jupyter notebook to a marimo notebook.

The argument may be either a path to a local .ipynb file,
or an .ipynb file hosted on GitHub.

Example usage:

marimo convert your_nb.ipynb > your_nb.py

After conversion, you can open the notebook in the editor:

marimo edit your_nb.py

Since marimo is different from traditional notebooks, once in the editor,
you may need to fix errors like multiple definition errors or cycle
errors.
"""
print(ipynb_to_marimo.convert_from_path(ipynb))


@main.command()
def env() -> None:
"""Print out environment information for debugging purposes.
Expand All @@ -536,4 +513,5 @@ def env() -> None:
print(json.dumps(get_system_info(), indent=2))


main.command()(convert)
main.add_command(export)
50 changes: 50 additions & 0 deletions marimo/_cli/convert/commands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright 2024 Marimo. All rights reserved.
from __future__ import annotations

from pathlib import Path

import click

from marimo._cli.convert.ipynb import convert_from_ipynb
from marimo._cli.convert.markdown import convert_from_md
from marimo._cli.convert.utils import load_external_file


@click.argument("filename", required=True)
def convert(
filename: str,
) -> None:
r"""Convert a Jupyter notebook or Markdown file to a marimo notebook.

The argument may be either a path to a local .ipynb/.md file,
dmadisetti marked this conversation as resolved.
Show resolved Hide resolved
or an .ipynb/.md file hosted on GitHub.

Example usage:

marimo convert your_nb.ipynb > your_nb.py

or

marimo convert your_nb.md > your_nb.py
dmadisetti marked this conversation as resolved.
Show resolved Hide resolved

Jupyter notebook conversion will strip out all outputs. Markdown cell
conversion with occur on the presence of `\`\`\`{python}` code blocks.
After conversion, you can open the notebook in the editor:

marimo edit your_nb.py

Since marimo is different from traditional notebooks, once in the editor,
you may need to fix errors like multiple definition errors or cycle
errors.
"""
ext = Path(filename).suffix
if ext not in (".ipynb", ".md", ".qmd"):
raise click.UsageError("File must be an .ipynb or .md file")

text = load_external_file(filename, ext)
if ext == ".ipynb":
notebook = convert_from_ipynb(text)
else:
assert ext in (".md", ".qmd")
notebook = convert_from_md(text)
click.echo(notebook)
36 changes: 36 additions & 0 deletions marimo/_cli/convert/ipynb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright 2024 Marimo. All rights reserved.
from __future__ import annotations

import json

from marimo._cli.convert.utils import (
generate_from_sources,
load_external_file,
markdown_to_marimo,
)


def convert_from_ipynb(raw_notebook: str) -> str:
notebook = json.loads(raw_notebook)
sources = []
has_markdown = False
for cell in notebook["cells"]:
source = (
cell["source"]
if isinstance(cell["source"], str)
else "".join(cell["source"])
)
if cell["cell_type"] == "markdown":
has_markdown = True
source = markdown_to_marimo(source)
sources.append(source)

if has_markdown:
sources.append("import marimo as mo")

return generate_from_sources(sources)


def convert_from_ipynb_file(file_path: str) -> str:
raw_notebook = load_external_file(file_path, "ipynb")
return convert_from_ipynb(raw_notebook)
218 changes: 218 additions & 0 deletions marimo/_cli/convert/markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
# Copyright 2024 Marimo. All rights reserved.
from __future__ import annotations

import re
from typing import Any, Callable, Literal

# Native to python
from xml.etree.ElementTree import Element, SubElement

# Note: yaml is also a python builtin
import yaml

# Markdown is a dependency of marimo, as such we utilize it as much as possible
# to parse markdown.
from markdown import Markdown
from markdown.blockparser import BlockParser
from markdown.blockprocessors import BlockProcessor
from markdown.preprocessors import Preprocessor
from markdown.util import HTML_PLACEHOLDER_RE, Registry

# As are extensions
from pymdownx.superfences import SuperFencesCodeExtension # type: ignore

from marimo._ast.app import _AppConfig
from marimo._cli.convert.utils import generate_from_sources, markdown_to_marimo

# CSafeLoader is faster than SafeLoader.
try:
from yaml import CSafeLoader as SafeLoader
except ImportError:
from yaml import SafeLoader # type: ignore[assignment]

MARIMO_MD = "marimo-md"
MARIMO_CODE = "marimo-code"


def _is_code_tag(text: str) -> bool:
head = text.split("\n")[0].strip()
return bool(re.search(r"\{.*python.*\}", head))


def _tree_to_app(root: Element) -> str:
# Extract meta data from root attributes.
config_keys = {"title": "app_title"}
config = {
config_keys[key]: value
for key, value in root.items()
if key in config_keys
}
app_config = _AppConfig.from_untrusted_dict(config)

sources = []
for child in root:
source = child.text
if not (source and source.strip()):
continue
if child.tag == MARIMO_MD:
source = markdown_to_marimo(source)
else:
assert child.tag == MARIMO_CODE, f"Unknown tag: {child.tag}"
sources.append(source)

return generate_from_sources(sources, app_config)


class MarimoParser(Markdown):
"""Parses Markdown to marimo notebook."""

# Considering how ubiquitous "markdown" is, it's a little surprising the
# internal structure isn't cleaner/ more modular. This "monkey-patching"
# is comparable to some of the code in markdown extensions- and given this
# library has been around since 2004, the internals should be relatively
# stable.
output_formats: dict[Literal["marimo"], Callable[[Element], str]] = { # type: ignore[assignment, misc]
"marimo": _tree_to_app,
}
meta: dict[str, Any]

def build_parser(self) -> MarimoParser:
"""
Creates blank registries as a base.

Note that evoked by itself, will create an infinite loop, since
block-parsers will never dequeue the extracted blocks.
"""
self.preprocessors = Registry()
self.parser = BlockParser(self)
self.inlinePatterns = Registry()
self.treeprocessors = Registry()
self.postprocessors = Registry()
return self


class FrontMatterPreprocessor(Preprocessor):
"""Preprocessor for to extract YAML front matter.

The built-in MetaPreprocessor does not handle frontmatter yaml properly, so
this is a custom implementation.

Like the built-in MetaPreprocessor, this preprocessor extracts yaml and
stores it in the Markdown's metadata attribute. Inspired by conversation
and linked project in github/Python-Markdown/markdown/497. See docdown
(BSD-3) or python-frontmatter (MIT) for similar implementations.
"""

def __init__(self, md: MarimoParser):
super().__init__(md)
self.md = md
self.md.meta = {}
# Regex captures loose yaml for frontmatter
# Should match the following:
# ---
# title: "Title"
# whatever
# ---
self.yaml_front_matter_regex = re.compile(
r"^---\s*\n(.*?\n?)(?:---)\s*\n", re.UNICODE | re.DOTALL
)

def run(self, lines: list[str]) -> list[str]:
if not lines:
return lines

doc = "\n".join(lines)
result = self.yaml_front_matter_regex.match(doc)

if result:
yaml_content = result.group(1)
try:
meta = yaml.load(yaml_content, SafeLoader)
if isinstance(meta, dict):
self.md.meta = meta # type: ignore[attr-defined]
doc = doc[result.end() :].lstrip("\n")
# If there's an error in parsing YAML, ignore the meta and proceed.
except yaml.YAMLError as e:
raise e
return doc.split("\n")


class ExpandAndClassifyProcessor(BlockProcessor):
"""Separates code blocks and markdown blocks."""

stash: dict[str, Any]

def test(*_args: Any) -> bool:
return True

def run(self, parent: Element, blocks: list[str]) -> None:
# Copy app metadata to the parent element.
for key, value in self.parser.md.meta.items(): # type: ignore[attr-defined]
if isinstance(value, str):
parent.set(key, value)

text: list[str] = []

def add_paragraph() -> None:
if not text:
return
# An additional line break is added before code blocks.
if text[-1].strip() == "":
text.pop()
if not text:
return
paragraph = SubElement(parent, MARIMO_MD)
paragraph.text = "\n".join(text).strip()
text.clear()

# Operate on line basis, not block basis, but use block processor
# instead of preprocessor, because we still want to operate on the
# xml tree.
for line in "\n\n".join(blocks).split("\n"):
# Superfences replaces code blocks with a placeholder,
# Check for the placeholder, and ensure it is a marimo code block,
# otherwise consider it as markdown.
if HTML_PLACEHOLDER_RE.match(line.strip()):
lookup = line.strip()[1:-1]
code = self.stash[lookup][0]
if _is_code_tag(code):
add_paragraph()
code_block = SubElement(parent, MARIMO_CODE)
code_block.text = "\n".join(code.split("\n")[1:-1])
else:
text.extend(code.split("\n"))
else:
text.append(line)
add_paragraph()
# Flush to indicate all blocks have been processed.
blocks.clear()


def _build_marimo_parser() -> MarimoParser:
# Build here opposed to the parent class since there is intermediate logic
# after the parser is built, and it is more clear here what is registered.
md = MarimoParser(output_format="marimo") # type: ignore[arg-type]
md.stripTopLevelTags = False

# Note: MetaPreprocessor does not properly handle frontmatter yaml, so
# cleanup occurs in the block-processor.
md.preprocessors.register(FrontMatterPreprocessor(md), "frontmatter", 100)
fences_ext = SuperFencesCodeExtension()
fences_ext.extendMarkdown(md)
# TODO: Consider adding the admonition extension, and integrating it with
# mo.markdown callouts.

block_processors = ExpandAndClassifyProcessor(md.parser)
block_processors.stash = fences_ext.stash.stash
md.parser.blockprocessors.register(
block_processors, "marimo-processor", 10
)

return md


def convert_from_md(text: str) -> str:
if not text:
raise ValueError("No content found in markdown.")
md = _build_marimo_parser()
return md.convert(text)
Loading
Loading