In [1]:
#default_exp mdx

# Custom Preprocessors For MDX

> Custom preprocessors that help convert notebook content into MDX

This module defines [nbconvert.Custom Preprocessors](https://nbconvert.readthedocs.io/en/latest/nbconvert_library.html#Custom-Preprocessors) that facilitate transforming notebook content into MDX, which is a variation of markdown.

In [2]:
# export
from nbconvert.preprocessors import Preprocessor
from nbconvert import MarkdownExporter, NotebookExporter
from nbconvert.preprocessors import TagRemovePreprocessor
from nbdev.imports import get_config
import traitlets
from IPython.display import display, Markdown
from traitlets.config import Config
from pathlib import Path
import re, os, json
from fastcore.all import L, flatten
from nbdev.export import read_nb

In [3]:
#hide
# Give the notebook a `__file__` located under the configured `lib_path`; only its
# parent directory is used below, to find the `templates/` folder.
__file__ = str(get_config().path("lib_path")/'preproc.py')

In [4]:
#hide
def run_preprocessor(pp, nbfile, template_file='ob.tpl', display_results=False):
    """Convert `nbfile` with a `MarkdownExporter` configured to use the preprocessors `pp`.

    The template is resolved as `templates/<template_file>` next to `__file__`.
    Returns the result of `MarkdownExporter.from_filename`; when `display_results`
    is true, element 0 of that result is printed before returning.
    """
    template_path = Path(__file__).parent/'templates/'/f"{template_file}"
    cfg = Config()
    cfg.MarkdownExporter.preprocessors = pp
    cfg.MarkdownExporter.template_file = str(template_path)
    converted = MarkdownExporter(config=cfg).from_filename(nbfile)
    if display_results:
        print(converted[0])
    return converted

In [5]:
#hide
def show_plain_md(nbfile):
    """Print `nbfile` as converted by a `MarkdownExporter` with no custom configuration."""
    markdown_body = MarkdownExporter().from_filename(nbfile)[0]
    print(markdown_body)

In [6]:
#export
# Matches one `#cell_meta:<non-whitespace text>` comment line, including optional
# leading whitespace and the line break that ends it. Used by `InjectMeta` to detect
# such comments and embedded in `CleanMagics.pattern` to strip them.
_re_meta = r'^\s*#cell_meta:\S+\s*[\n\r]'

In [7]:
#export
class InjectMeta(Preprocessor):
    """
    Allows you to inject metadata into a cell for further preprocessing with a comment.

    Every `#cell_meta:key=value` comment line in a code cell is recorded as
    `cell.metadata['nbdoc'][key] = value`. Entries that contain no `=` are
    reported with a printed warning and skipped.
    """
    # Same shape as `_re_meta`, but with the payload captured in group 2.
    pattern = r'(^\s*#cell_meta:)(\S+)(\s*[\n\r])'
    
    def preprocess_cell(self, cell, resources, index):
        """Copy `#cell_meta:` comments of a code cell into its `nbdoc` metadata.

        Returns the (possibly updated) `cell` together with `resources` unchanged.
        """
        # The guard must use MULTILINE like the `findall` below; otherwise `^` only
        # anchors at the start of the source and a comment on a later line is missed.
        if cell.cell_type == 'code' and re.search(_re_meta, cell.source, re.MULTILINE):
            cell_meta = re.findall(self.pattern, cell.source, re.MULTILINE)
            d = cell.metadata.get('nbdoc', {})
            for _, m, _ in cell_meta:
                if '=' in m:
                    # Split on the first `=` only, so a value may itself contain `=`
                    # (a plain split would raise ValueError when unpacking).
                    k,v = m.split('=', 1)
                    d[k] = v
                else: print(f"Warning cell_meta:{m} does not have '=' will be ignored.")
            cell.metadata['nbdoc'] = d
        return cell, resources

To inject metadata, add a comment to a cell using the pattern `#cell_meta:key=value`. The `key=value` part must not contain whitespace.

For example, consider the following code:

In [8]:
# Read the first cell of the test notebook directly from disk, without any preprocessing.
_test_file = 'test_files/hello_world.ipynb'
first_cell = read_nb(_test_file)['cells'][0]
print(first_cell['source'])

#cell_meta:show_steps=start,train
print('hello world')


At the moment, this cell has no metadata:

In [9]:
# Metadata of the cell as read from disk above.
print(first_cell['metadata'])

{}


However, after we process this notebook with `InjectMeta`, the appropriate metadata will be injected:

In [10]:
# Convert the test notebook with a NotebookExporter that has InjectMeta configured
# as a preprocessor.
c = Config()
c.NotebookExporter.preprocessors = [InjectMeta]
exp = NotebookExporter(config=c)
# The first element returned by `from_filename` is parsed as JSON below.
cells, _ = exp.from_filename(_test_file)
first_cell = json.loads(cells)['cells'][0]

# The `#cell_meta:show_steps=start,train` comment should now be stored under `nbdoc`.
assert first_cell['metadata'] == {'nbdoc': {'show_steps': 'start,train'}}
first_cell['metadata']

{'nbdoc': {'show_steps': 'start,train'}}

In [11]:
#export
class MetaflowPreamble(Preprocessor):
    """Remove the preamble from Metaflow output.

    Applies to cells whose source looks like a `python ... run` command. For each
    `stdout` stream output, everything before the line containing
    `Workflow starting` is dropped, ANSI escape sequences are removed, and the
    result is stripped of surrounding whitespace.
    """
    # Group 1 is the preamble (Metaflow banner / validation messages); group 2 starts
    # at the line break before the "Workflow starting" line and is what gets kept.
    pattern = r'([\s\S]*Metaflow[\s\S]*Validating[\s\S]+The graph[\s\S]+)(\n[\s\S]+Workflow starting[\s\S]+)'
    # Matches ANSI escape sequences (e.g. the colour codes in the log output).
    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
    
    def preprocess_cell(self, cell, resources, index):
        """Clean the stdout of a Metaflow run cell; returns `(cell, resources)`."""
        # Raw string: '\s' in a plain string literal is an invalid escape sequence.
        if re.search(r'\s*python.+run.*', cell.source) and 'outputs' in cell:
            for o in cell.outputs:
                # Only stream outputs carry a `name`; `.get` avoids an AttributeError
                # on other output types (execute_result, display_data, error).
                if o.get('name') == 'stdout':
                    o['text'] = self.ansi_escape.sub('', re.sub(self.pattern, r'\2', o['text'])).strip()
        return cell, resources

When you run a Metaflow flow, you are presented with a fair amount of boilerplate before the job starts running that is not necessary to show in the documentation:

In [12]:
# Baseline: markdown from the default exporter, before any custom preprocessing.
show_plain_md('test_files/run_flow.ipynb')

```python
!python myflow.py run
```

    [35m[1mMetaflow 2.5.0.post6+git62f5e52[0m[35m[22m executing [0m[31m[1mMyFlow[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:hamel[0m[35m[22m[K[0m[35m[22m[0m
    [35m[22mValidating your flow...[K[0m[35m[22m[0m
    [32m[1m    The graph looks good![K[0m[32m[1m[0m
    [35m[22mRunning pylint...[K[0m[35m[22m[0m
    [32m[1m    Pylint is happy![K[0m[32m[1m[0m
    [35m2022-02-15 13:57:54.075 [0m[1mWorkflow starting (run-id 1644962274071949):[0m
    [35m2022-02-15 13:57:54.084 [0m[32m[1644962274071949/start/1 (pid 46643)] [0m[1mTask is starting.[0m
    [35m2022-02-15 13:57:54.703 [0m[32m[1644962274071949/start/1 (pid 46643)] [0m[22mthis is the start[0m
    [35m2022-02-15 13:57:54.771 [0m[32m[1644962274071949/start/1 (pid 46643)] [0m[1mTask finished successfully.[0m
    [35m2022-02-15 13:57:54.779 [0m[32m[1644962274071949/train/2 (pid 46648)] [0m[1mTask is starting.[0m
    [35m2022

We want to strip this output as well as the superfluous colors, which we can do with the `MetaflowPreamble` preprocessor:

In [13]:
# Convert with only MetaflowPreamble enabled and check that the preamble text is gone.
c, _ = run_preprocessor([MetaflowPreamble], 'test_files/run_flow.ipynb', display_results=True)
assert 'Validating your flow...' not in c

```python
!python myflow.py run
```

    2022-02-15 13:57:54.075 Workflow starting (run-id 1644962274071949):
    2022-02-15 13:57:54.084 [1644962274071949/start/1 (pid 46643)] Task is starting.
    2022-02-15 13:57:54.703 [1644962274071949/start/1 (pid 46643)] this is the start
    2022-02-15 13:57:54.771 [1644962274071949/start/1 (pid 46643)] Task finished successfully.
    2022-02-15 13:57:54.779 [1644962274071949/train/2 (pid 46648)] Task is starting.
    2022-02-15 13:57:55.412 [1644962274071949/train/2 (pid 46648)] the train step
    2022-02-15 13:57:55.483 [1644962274071949/train/2 (pid 46648)] Task finished successfully.
    2022-02-15 13:57:55.491 [1644962274071949/end/3 (pid 46653)] Task is starting.
    2022-02-15 13:57:56.121 [1644962274071949/end/3 (pid 46653)] this is the end
    2022-02-15 13:57:56.189 [1644962274071949/end/3 (pid 46653)] Task finished successfully.
    2022-02-15 13:57:56.190 Done!



In [29]:
#export
class MetaflowSelectSteps(Preprocessor):
    """
    Hide Metaflow steps in output based on cell metadata.

    Reads a comma-separated list of step names from
    `cell.metadata['nbdoc']['show_steps']`. For cells whose source looks like a
    `python ... run` command, each `stdout` stream output is replaced by the log
    lines belonging to the requested steps, in the order the steps are listed,
    with a `...` line after each step that produced any lines.
    """
    # `{0}` is filled with a step name; matches whole log lines that contain
    # `<digits>/<step>/<digits> (pid <digits>)`.
    re_step = r'.*\d+/{0}/\d+\s\(pid\s\d+\).*'
    
    def preprocess_cell(self, cell, resources, index):
        """Filter the stdout of a Metaflow run cell; returns `(cell, resources)`."""
        steps = cell.metadata.get('nbdoc', {}).get('show_steps')
        # Raw string: '\s' in a plain string literal is an invalid escape sequence.
        if re.search(r'\s*python.+run.*', cell.source) and 'outputs' in cell and steps:
            # Compile one regex per requested step, once per cell. Names are stripped
            # and escaped so stray spaces or regex metacharacters in the metadata
            # cannot change what is matched; empty entries are skipped.
            step_patterns = [re.compile(self.re_step.format(re.escape(s.strip())))
                             for s in steps.split(',') if s.strip()]
            for o in cell.outputs:
                # Only stream outputs carry a `name`; `.get` avoids an AttributeError
                # on other output types (execute_result, display_data, error).
                if o.get('name') == 'stdout':
                    final_steps = []
                    for step_pattern in step_patterns:
                        found_steps = step_pattern.findall(o['text'])
                        if found_steps:
                            final_steps += found_steps + ['...']
                    o['text'] = '\n'.join(final_steps)
        return cell, resources

`MetaflowSelectSteps` is meant to be used with `InjectMeta` to only show specific steps in the output logs from Metaflow.  

For example, if you want to only show the `start` and `train` steps in your flow, you would annotate your cell with the following pattern: `#cell_meta:show_steps=<step_name>`

In the below example, `#cell_meta:show_steps=start,train` shows the `start` and `train` steps, whereas `#cell_meta:show_steps=train` only shows the `train` step:

In [15]:
c, _ = run_preprocessor([InjectMeta, MetaflowSelectSteps],
                        'test_files/run_flow_showstep.ipynb',
                        display_results=True)
# Check the `/<step>/` task path rather than the bare word "end", which could also
# appear inside unrelated text; also confirm the requested steps are still present.
assert '/end/' not in c
assert '/start/' in c and '/train/' in c

```python
#cell_meta:show_steps=start,train
!python myflow.py run
```

    [35m2022-02-15 14:01:14.810 [0m[32m[1644962474801237/start/1 (pid 46758)] [0m[1mTask is starting.[0m
    [35m2022-02-15 14:01:15.433 [0m[32m[1644962474801237/start/1 (pid 46758)] [0m[22mthis is the start[0m
    [35m2022-02-15 14:01:15.500 [0m[32m[1644962474801237/start/1 (pid 46758)] [0m[1mTask finished successfully.[0m
    ...
    [35m2022-02-15 14:01:15.507 [0m[32m[1644962474801237/train/2 (pid 46763)] [0m[1mTask is starting.[0m
    [35m2022-02-15 14:01:16.123 [0m[32m[1644962474801237/train/2 (pid 46763)] [0m[22mthe train step[0m
    [35m2022-02-15 14:01:16.188 [0m[32m[1644962474801237/train/2 (pid 46763)] [0m[1mTask finished successfully.[0m
    ...


```python
#cell_meta:show_steps=train
!python myflow.py run
```

    [35m2022-02-15 14:01:18.924 [0m[32m[1644962478210532/train/2 (pid 46783)] [0m[1mTask is starting.[0m
    [35m2022-02-15 14:01:19.566 [0m[32m[1644962

In [16]:
#export
class WriteTitle(Preprocessor):
    """Modify the code-fence with the filename upon %%writefile cell magic.

    For a code cell containing `%%writefile <filename>`, sets
    `cell.metadata.magics_language` to `<ext> title="<filename>"` and clears the
    cell's outputs. Other cells are returned untouched.
    """
    # Group 2 captures the filename that follows `%%writefile`.
    pattern = r'(^[\S\s]*%%writefile\s)(\S+)\n'
    
    def preprocess_cell(self, cell, resources, index):
        """Annotate a `%%writefile` code cell; returns `(cell, resources)`."""
        # Only code cells can carry cell magics; without this guard a markdown or raw
        # cell that merely mentions `%%writefile x` would be given an `outputs` key.
        if cell.cell_type != 'code': return cell, resources
        m = re.match(self.pattern, cell.source)
        if m: 
            filename = m.group(2)
            # Derive the extension from the last path component only, so a directory
            # in the path (e.g. `scripts/run`) never ends up in the fence language.
            ext = Path(filename).name.split('.')[-1]
            cell.metadata.magics_language = f'{ext} title="{filename}"'
            cell.outputs = []
        return cell, resources

`WriteTitle` creates the proper code-fence with a title in the situation where the `%%writefile` magic is used.

For example, here are contents before pre-processing:

In [17]:
# Baseline: markdown from the default exporter, before any custom preprocessing.
show_plain_md('test_files/writefile.ipynb')

A test notebook


```python
%%writefile myflow.py
from metaflow import FlowSpec, step

class MyFlow(FlowSpec):
    
    @step
    def start(self):
        print('this is the start')
        self.next(self.train)
    
    @step
    def train(self):
        print('the train step')
        self.next(self.end)
    
    @step
    def end(self):
        print('this is the end')

if __name__ == '__main__':
    MyFlow()
```

    Overwriting myflow.py



```python
%%writefile hello.txt

Hello World
```

    Overwriting hello.txt




When we use `WriteTitle`, you will see the code-fence will change appropriately:

In [18]:
# Convert with only WriteTitle enabled and check both code fences carry a title.
c, _ = run_preprocessor([WriteTitle], 'test_files/writefile.ipynb', display_results=True)
assert '```py title="myflow.py"' in c and '```txt title="hello.txt"' in c

A test notebook


```py title="myflow.py"
%%writefile myflow.py
from metaflow import FlowSpec, step

class MyFlow(FlowSpec):
    
    @step
    def start(self):
        print('this is the start')
        self.next(self.train)
    
    @step
    def train(self):
        print('the train step')
        self.next(self.end)
    
    @step
    def end(self):
        print('this is the end')

if __name__ == '__main__':
    MyFlow()
```


```txt title="hello.txt"
%%writefile hello.txt

Hello World
```



In [19]:
#export
class CleanMagics(Preprocessor):
    """A preprocessor to remove cell magic commands and #cell_meta: comments

    In code cells, every line that starts (after optional whitespace) with `%` or
    `%%`, and every `#cell_meta:` comment line, is deleted from the source; the
    remaining source is stripped of surrounding whitespace.
    """
    # Alternative 1: a `%`/`%%` magic line; alternative 2: a `#cell_meta:` line.
    pattern = r'(^\s*(%%|%).+?[\n\r])|({0})'.format(_re_meta)
    
    def preprocess_cell(self, cell, resources, index):
        """Strip magics and `#cell_meta:` comments; returns `(cell, resources)`."""
        if cell.cell_type == 'code': 
            # MULTILINE lets `^` anchor at every line. Without it only a directive at
            # the very start of the source is removed, so stacked directives (several
            # `#cell_meta:` lines, or a meta line followed by a magic) would leak into
            # the rendered code.
            # NOTE(review): this also removes a continuation line that begins with `%`.
            cell.source = re.sub(self.pattern, '', cell.source, flags=re.MULTILINE).strip()
        return cell, resources

`CleanMagics` strips magic cell commands `%%` so they do not appear in rendered markdown files:

In [20]:
# WriteTitle runs first so the filename is captured before CleanMagics removes the magic line.
c, _ = run_preprocessor([WriteTitle, CleanMagics], 'test_files/writefile.ipynb', display_results=True)
assert '%%' not in c

A test notebook


```py title="myflow.py"
from metaflow import FlowSpec, step

class MyFlow(FlowSpec):
    
    @step
    def start(self):
        print('this is the start')
        self.next(self.train)
    
    @step
    def train(self):
        print('the train step')
        self.next(self.end)
    
    @step
    def end(self):
        print('this is the end')

if __name__ == '__main__':
    MyFlow()
```


```txt title="hello.txt"
Hello World
```



Here is how `CleanMagics` works on the file with the Metaflow log outputs from earlier. We can see that the `#cell_meta` comments are gone:

In [21]:
# InjectMeta must run before CleanMagics: it reads the `#cell_meta:` comments that
# CleanMagics then removes from the source.
c, _ = run_preprocessor([InjectMeta, MetaflowSelectSteps, CleanMagics],
                        'test_files/run_flow_showstep.ipynb', display_results=True)
# Back the claim in the prose with a check instead of relying on reading the output.
assert '#cell_meta' not in c

```python
!python myflow.py run
```

    [35m2022-02-15 14:01:14.810 [0m[32m[1644962474801237/start/1 (pid 46758)] [0m[1mTask is starting.[0m
    [35m2022-02-15 14:01:15.433 [0m[32m[1644962474801237/start/1 (pid 46758)] [0m[22mthis is the start[0m
    [35m2022-02-15 14:01:15.500 [0m[32m[1644962474801237/start/1 (pid 46758)] [0m[1mTask finished successfully.[0m
    ...
    [35m2022-02-15 14:01:15.507 [0m[32m[1644962474801237/train/2 (pid 46763)] [0m[1mTask is starting.[0m
    [35m2022-02-15 14:01:16.123 [0m[32m[1644962474801237/train/2 (pid 46763)] [0m[22mthe train step[0m
    [35m2022-02-15 14:01:16.188 [0m[32m[1644962474801237/train/2 (pid 46763)] [0m[1mTask finished successfully.[0m
    ...


```python
!python myflow.py run
```

    [35m2022-02-15 14:01:18.924 [0m[32m[1644962478210532/train/2 (pid 46783)] [0m[1mTask is starting.[0m
    [35m2022-02-15 14:01:19.566 [0m[32m[1644962478210532/train/2 (pid 46783)] [0m[22mthe train step[0m
   

In [22]:
#hide
# The first cell of hello_world.ipynb starts with a `#cell_meta:` comment; it must be stripped.
c, _ = run_preprocessor([WriteTitle, CleanMagics], 'test_files/hello_world.ipynb')
assert '#cell_meta' not in c

In [23]:
#export
class BashIdentify(Preprocessor):
    """A preprocessor to identify bash commands and mark them appropriately

    A code cell whose source begins (after optional whitespace) with `!` gets
    `cell.metadata.magics_language = 'bash'`, and the leading `!` of its lines is
    removed so the source is plain shell.
    """
    # Raw string: '\s' in a plain string literal is an invalid escape sequence.
    pattern = r'^\s*!'
    
    def preprocess_cell(self, cell, resources, index):
        """Mark and clean a shell-command cell; returns `(cell, resources)`."""
        if cell.cell_type == 'code' and re.search(self.pattern, cell.source):
            cell.metadata.magics_language = 'bash'
            # MULTILINE strips the `!` from every command line; without it only the
            # first `!` is removed and later `!cmd` lines end up inside the bash fence.
            cell.source = re.sub(self.pattern, '', cell.source, flags=re.MULTILINE).strip()
        return cell, resources

When we issue a shell command in a notebook with `!`, we need to change the code-fence from `python` to `bash` and remove the `!`:

In [24]:
# Check that the fence language becomes `bash` and the leading `!` is removed.
c, _ = run_preprocessor([MetaflowPreamble, BashIdentify], 'test_files/run_flow.ipynb', display_results=True)
assert "```bash" in c and '!python' not in c

```bash
python myflow.py run
```

    2022-02-15 13:57:54.075 Workflow starting (run-id 1644962274071949):
    2022-02-15 13:57:54.084 [1644962274071949/start/1 (pid 46643)] Task is starting.
    2022-02-15 13:57:54.703 [1644962274071949/start/1 (pid 46643)] this is the start
    2022-02-15 13:57:54.771 [1644962274071949/start/1 (pid 46643)] Task finished successfully.
    2022-02-15 13:57:54.779 [1644962274071949/train/2 (pid 46648)] Task is starting.
    2022-02-15 13:57:55.412 [1644962274071949/train/2 (pid 46648)] the train step
    2022-02-15 13:57:55.483 [1644962274071949/train/2 (pid 46648)] Task finished successfully.
    2022-02-15 13:57:55.491 [1644962274071949/end/3 (pid 46653)] Task is starting.
    2022-02-15 13:57:56.121 [1644962274071949/end/3 (pid 46653)] this is the end
    2022-02-15 13:57:56.189 [1644962274071949/end/3 (pid 46653)] Task finished successfully.
    2022-02-15 13:57:56.190 Done!



## Composing Preprocessors Into A Pipeline

Let's see how you can compose all of these preprocessors together to process notebooks appropriately:

In [25]:
#export
def get_mdx_exporter(template_file='ob.tpl'):
    """Build a `MarkdownExporter` that chains this module's preprocessors.

    The template is resolved as `templates/<template_file>` next to `__file__`;
    a `ValueError` is raised when that file does not exist. Tag-based removal is
    configured for the tags `remove_cell`, `remove_output` and `remove_input`.
    """
    templates_dir = Path(__file__).parent/'templates/'
    template_path = templates_dir/f"{template_file}"
    if not template_path.exists():
        raise ValueError(f"{template_path} does not exist in {templates_dir}")

    cfg = Config()
    # Cell tags handled by nbconvert's TagRemovePreprocessor.
    cfg.TagRemovePreprocessor.remove_cell_tags = ("remove_cell",)
    cfg.TagRemovePreprocessor.remove_all_outputs_tags = ('remove_output',)
    cfg.TagRemovePreprocessor.remove_input_tags = ('remove_input',)
    # Preprocessors are listed in the order they are meant to run.
    cfg.MarkdownExporter.preprocessors = [
        InjectMeta,
        WriteTitle,
        CleanMagics,
        BashIdentify,
        MetaflowPreamble,
        MetaflowSelectSteps,
    ]
    cfg.MarkdownExporter.template_file = str(template_path)
    return MarkdownExporter(config=cfg)

`get_mdx_exporter` combines all of the previous preprocessors, along with the built in `TagRemovePreprocessor` to allow for hiding cell inputs/outputs based on cell tags.  Here is an example of markdown generated from a notebook with the default preprocessing:

In [26]:
# Baseline: markdown from the default exporter, before any custom preprocessing.
show_plain_md('test_files/example_input.ipynb')

---
title: my hello page title
description: my hello page description
hide_table_of_contents: true
---
## This is a test notebook

This is a shell command:


```python
! echo hello
```

    hello


We are writing a python script to disk:


```python
%%writefile myflow.py

from metaflow import FlowSpec, step

class MyFlow(FlowSpec):
    
    @step
    def start(self):
        print('this is the start')
        self.next(self.end)
    
    @step
    def end(self):
        print('this is the end')

if __name__ == '__main__':
    MyFlow()
```

    Overwriting myflow.py


Another shell command where we run a flow:


```python
#cell_meta:show_steps=start
! python myflow.py run
```

    [35m[1mMetaflow 2.5.0.post6+git62f5e52[0m[35m[22m executing [0m[31m[1mMyFlow[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:hamel[0m[35m[22m[K[0m[35m[22m[0m
    [35m[22mValidating your flow...[K[0m[35m[22m[0m
    [32m[1m    The graph looks good![K[0m[32m[1m[0m
    [35m[22mRun

Here is the same notebook, but with all of the preprocessors that we defined in this module.  Additionally, we hide the input of the last cell which prints `hello, you should not see the print statement...` by using the built in `TagRemovePreprocessor`:

In [27]:
# Convert the same notebook with the full preprocessor pipeline and print the markdown body.
exp = get_mdx_exporter()
print(exp.from_filename('test_files/example_input.ipynb')[0])

---
title: my hello page title
description: my hello page description
hide_table_of_contents: true
---
## This is a test notebook

This is a shell command:


```bash
echo hello
```

    hello


We are writing a python script to disk:


```py title="myflow.py"
from metaflow import FlowSpec, step

class MyFlow(FlowSpec):
    
    @step
    def start(self):
        print('this is the start')
        self.next(self.end)
    
    @step
    def end(self):
        print('this is the end')

if __name__ == '__main__':
    MyFlow()
```

Another shell command where we run a flow:


```bash
python myflow.py run
```

    2022-02-15 14:11:09.224 [1644963069213536/start/1 (pid 46840)] Task is starting.
    2022-02-15 14:11:09.858 [1644963069213536/start/1 (pid 46840)] this is the start
    2022-02-15 14:11:09.929 [1644963069213536/start/1 (pid 46840)] Task finished successfully.
    ...

This is a normal python cell:




    2



The next cell has a cell tag of `remove_input`, so you should only see 