In [1]:
#default_exp mdx

# Custom Preprocessors For MDX

> Custom preprocessors that help convert notebook content into MDX

In [2]:
# export
from nbconvert.preprocessors import Preprocessor
from nbconvert import MarkdownExporter
from nbconvert.preprocessors import TagRemovePreprocessor
from nbdev.imports import get_config
import traitlets
from IPython.display import display, Markdown
from traitlets.config import Config
from pathlib import Path
import re

In [3]:
#hide
# Define `__file__` for the notebook session so that the `Path(__file__).parent/'templates/'`
# lookups in the cells below resolve against the library directory.
# NOTE(review): the notebook declares `#default_exp mdx`, but the file name used here is
# 'preproc.py'. Only the parent directory is used in the visible cells — confirm the name is intentional.
__file__ = str(get_config().path("lib_path")/'preproc.py')

In [4]:
#hide
def run_preprocessor(pp, nbfile, template_file='ob.tpl', display_results=False):
    """Export `nbfile` to markdown with the preprocessors `pp` applied.

    The template is looked up in the `templates/` folder next to `__file__`.
    Returns whatever `MarkdownExporter.from_filename` returns; when
    `display_results` is truthy, the first element of that result is printed too.
    """
    template_path = Path(__file__).parent/'templates/'/f"{template_file}"
    cfg = Config()
    cfg.MarkdownExporter.preprocessors = pp
    cfg.MarkdownExporter.template_file = str(template_path)
    exported = MarkdownExporter(config=cfg).from_filename(nbfile)
    if display_results:
        print(exported[0])
    return exported

In [5]:
#hide
def show_plain_md(nbfile):
    """Print `nbfile` as converted by a default, unconfigured `MarkdownExporter`."""
    exported = MarkdownExporter().from_filename(nbfile)
    print(exported[0])

In [6]:
#export
class CleanOutput(Preprocessor):
    """Remove the preamble from Metaflow output.

    For every cell whose source matches `run_pattern` and that has outputs,
    each output named 'stdout' is rewritten: the text matched by group 1 of
    `pattern` (the banner preceding the "Workflow starting" line) is dropped,
    ANSI escape sequences are removed, and surrounding whitespace is stripped.
    Outputs that are not named 'stdout' are left untouched.
    """
    # Group 1: banner text (Metaflow ... Validating ... The graph ...).
    # Group 2: the remainder, beginning at a newline ahead of "Workflow starting".
    pattern = r'([\s\S]*Metaflow[\s\S]*Validating[\s\S]+The graph[\s\S]+)(\n[\s\S]+Workflow starting[\s\S]+)'
    # ANSI escape sequences (ESC followed by a control function), e.g. colour codes.
    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
    # Cells whose source looks like a `python ... run` invocation. Written as a
    # raw string so `\s` is not treated as an (invalid) string escape.
    run_pattern = r'\s*python.+run.*'

    def preprocess_cell(self, cell, resources, index):
        """Clean stdout outputs of matching cells; return `(cell, resources)`."""
        if re.search(self.run_pattern, cell.source) and 'outputs' in cell:
            for o in cell.outputs:
                # Use `.get` rather than attribute access: outputs without a
                # `name` key (e.g. execute_result, display_data, error) would
                # otherwise raise AttributeError here.
                if o.get('name') == 'stdout':
                    o['text'] = self.ansi_escape.sub('', re.sub(self.pattern, r'\2', o.text)).strip()
        return cell, resources

In [7]:
#export
class WriteTitle(Preprocessor):
    """Modify the code-fence with the filename upon %%writefile cell magic.

    When a code cell contains `%%writefile <filename>` (optionally with the
    `-a`/`--append` flag before the filename), the cell's
    `metadata.magics_language` is set to `<ext> title="<filename>"`, where
    `<ext>` is the text after the last `.` in the filename, and the cell's
    outputs are cleared. Non-code cells and cells without the magic are
    returned unchanged.
    """
    # Group 2 is the target filename. The optional non-capturing group skips an
    # append flag so that `%%writefile -a foo.py` yields `foo.py`, not `-a`.
    pattern = r'(^[\S\s]*%%writefile\s(?:(?:-a|--append)[ \t]+)?)(\S+)\n'

    def preprocess_cell(self, cell, resources, index):
        """Tag `%%writefile` code cells with a titled fence; return `(cell, resources)`."""
        # Only code cells have a code fence and outputs; skipping other cell
        # types avoids attaching an `outputs` field to e.g. markdown cells that
        # merely mention the magic.
        if cell.cell_type != 'code':
            return cell, resources
        m = re.match(self.pattern, cell.source)
        if m:
            filename = m.group(2)
            # A filename without a dot yields the whole name as the "extension".
            ext = filename.split('.')[-1]
            cell.metadata.magics_language = f'{ext} title="{filename}"'
            # Drop the cell's outputs (e.g. the "Overwriting <file>" message).
            cell.outputs = []
        return cell, resources

`WriteTitle` rewrites the code fence of a cell that uses the `%%writefile` magic, so that the fence carries the file's extension as its language and the file name as its title.

For example, here are the contents before preprocessing:

In [8]:
# Baseline: export with a default MarkdownExporter, i.e. without any custom preprocessors.
show_plain_md('test_files/writefile.ipynb')

A test notebook


```python
%%writefile myflow.py

from metaflow import FlowSpec, step

class MyFlow(FlowSpec):
    
    @step
    def start(self):
        print('this is the start')
        self.next(self.end)
    
    @step
    def end(self):
        print('this is the end')

if __name__ == '__main__':
    MyFlow()
```

    Overwriting myflow.py



```python
%%writefile hello.txt

Hello World
```

    Overwriting hello.txt




After applying `WriteTitle`, the code fence changes accordingly and the `Overwriting ...` output is dropped:

In [9]:
# Export with only `WriteTitle` enabled, then check the two rewritten fence lines.
c, _ = run_preprocessor([WriteTitle], 'test_files/writefile.ipynb', display_results=True)
md_lines = c.split('\n')
assert md_lines[3] == '```py title="myflow.py"'
assert md_lines[24] == '```txt title="hello.txt"'

A test notebook


```py title="myflow.py"
%%writefile myflow.py

from metaflow import FlowSpec, step

class MyFlow(FlowSpec):
    
    @step
    def start(self):
        print('this is the start')
        self.next(self.end)
    
    @step
    def end(self):
        print('this is the end')

if __name__ == '__main__':
    MyFlow()
```


```txt title="hello.txt"
%%writefile hello.txt

Hello World
```



In [10]:
#export
class CleanMagics(Preprocessor):
    """A preprocessor to remove cell magic commands

    For code cells, a leading `%`/`%%` magic line is removed from the source
    and the remaining source is stripped of surrounding whitespace. Other cell
    types are returned unchanged.
    """
    # Raw string so that `\s` is a regex class rather than an invalid string
    # escape. Without re.MULTILINE, `^` anchors at the start of the source only,
    # so just a magic at the top of the cell (after optional whitespace) matches,
    # and it must be terminated by a newline or carriage return.
    pattern = r'^\s*(%%|%).+?[\n\r]'

    def preprocess_cell(self, cell, resources, index):
        """Strip a leading magic line from code cells; return `(cell, resources)`."""
        if cell.cell_type == 'code':
            cell.source = re.sub(self.pattern, '', cell.source).strip()
        return cell, resources

In [11]:
#export
class BashIdentify(Preprocessor):
    """A preprocessor to identify bash commands and mark them appropriately

    A code cell whose source starts (after optional whitespace) with `!` gets
    `metadata.magics_language = 'bash'`. The `!` prefix is then removed from
    every line that begins with one, so cells holding several shell commands do
    not keep stray `!` markers, and the source is stripped of surrounding
    whitespace. Other cells are returned unchanged.
    """
    # Detection: a shell escape at the very start of the cell (no re.MULTILINE).
    # Raw string so that `\s` is a regex class rather than an invalid string escape.
    pattern = r'^\s*!'
    # Removal: a `!` at the start of any line; indentation before it is kept,
    # blanks right after it are dropped.
    line_pattern = re.compile(r'^([ \t]*)![ \t]*', re.MULTILINE)

    def preprocess_cell(self, cell, resources, index):
        """Mark shell cells as bash and drop their `!` prefixes; return `(cell, resources)`."""
        if cell.cell_type == 'code' and re.search(self.pattern, cell.source):
            cell.metadata.magics_language = 'bash'
            cell.source = self.line_pattern.sub(r'\1', cell.source).strip()
        return cell, resources

In [12]:
#export
def get_exporter(template_file='ob.tpl'):
    """Build the `MarkdownExporter` that chains the MDX preprocessors.

    `template_file` is resolved inside the `templates/` folder next to
    `__file__`; a `ValueError` is raised when it does not exist. The exporter
    runs `WriteTitle`, `CleanMagics`, `BashIdentify` and `CleanOutput`, in
    that order.
    """
    template_dir = Path(__file__).parent/'templates/'
    template_path = template_dir/f"{template_file}"
    if not template_path.exists():
        raise ValueError(f"{template_path} does not exist in {template_dir}")

    cfg = Config()
    cfg.TagRemovePreprocessor.remove_cell_tags = ("remove_cell",)
    cfg.TagRemovePreprocessor.remove_all_outputs_tags = ('remove_output',)
    cfg.TagRemovePreprocessor.remove_input_tags = ('remove_input',)
    # NOTE(review): the tag lists above are configured, yet the preprocessor is
    # explicitly disabled here — confirm this is intentional.
    cfg.TagRemovePreprocessor.enabled = False
    cfg.MarkdownExporter.preprocessors = [WriteTitle, CleanMagics, BashIdentify, CleanOutput]
    cfg.MarkdownExporter.template_file = str(template_path)
    return MarkdownExporter(config=cfg)

In [13]:
# Exporter built with the default 'ob.tpl' template; reused by the cells below.
# NOTE(review): `hamel` says nothing about what the object is — consider a descriptive name such as `exporter`.
hamel = get_exporter()

In [14]:
# Baseline: export with a default MarkdownExporter, i.e. without any custom preprocessors.
show_plain_md('test_files/example_input.ipynb')

---
title: my hello page title
description: my hello page description
hide_table_of_contents: true
---

```python
! echo hello
```

    hello



```python
%%writefile myflow.py

from metaflow import FlowSpec, step

class MyFlow(FlowSpec):
    
    @step
    def start(self):
        print('this is the start')
        self.next(self.end)
    
    @step
    def end(self):
        print('this is the end')

if __name__ == '__main__':
    MyFlow()
```

    Overwriting myflow.py



```python
! python myflow.py run
```

    [35m[1mMetaflow 2.5.0.post6+git62f5e52[0m[35m[22m executing [0m[31m[1mMyFlow[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:hamel[0m[35m[22m[K[0m[35m[22m[0m
    [35m[22mValidating your flow...[K[0m[35m[22m[0m
    [32m[1m    The graph looks good![K[0m[32m[1m[0m
    [35m[22mRunning pylint...[K[0m[35m[22m[0m
    [32m[1m    Pylint is happy![K[0m[32m[1m[0m
    [35m2022-02-14 10:39:55.240 [0m[1mWorkflow starting (run-id 1644863995

In [15]:
# Same notebook through the exporter from `get_exporter()`; element 0 of the result is the markdown text.
print(hamel.from_filename('test_files/example_input.ipynb')[0])

---
title: my hello page title
description: my hello page description
hide_table_of_contents: true
---

```bash
echo hello
```

    hello



```py title="myflow.py"
from metaflow import FlowSpec, step

class MyFlow(FlowSpec):
    
    @step
    def start(self):
        print('this is the start')
        self.next(self.end)
    
    @step
    def end(self):
        print('this is the end')

if __name__ == '__main__':
    MyFlow()
```


```bash
python myflow.py run
```

    2022-02-14 10:39:55.240 Workflow starting (run-id 1644863995236850):
    2022-02-14 10:39:55.247 [1644863995236850/start/1 (pid 97470)] Task is starting.
    2022-02-14 10:39:55.840 [1644863995236850/start/1 (pid 97470)] this is the start
    2022-02-14 10:39:55.911 [1644863995236850/start/1 (pid 97470)] Task finished successfully.
    2022-02-14 10:39:55.919 [1644863995236850/end/2 (pid 97476)] Task is starting.
    2022-02-14 10:39:56.519 [1644863995236850/end/2 (pid 97476)] this is the end
    2022-02-14 10:39:56.5