Skip to content

Commit

Permalink
Add support for - and * bullet style in the metadata cell
Browse files Browse the repository at this point in the history
Made the metadata extraction less strict: added support for the `-` and `*` bullet styles,
the title no longer has to be on the first line, and the title can also be written as a `##` (or deeper) heading.
Simplified error handling during extraction, but made the exception more helpful.

Also added some unit tests for the metadata cell parsing.
  • Loading branch information
soxofaan committed May 10, 2018
1 parent 33290cf commit ec532ca
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 27 deletions.
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,18 @@ Exact Idea of MetaCell is:

> **All and Only** Metadata should be stored at the first Cell of ipynb
Writing a MetaCell is as simple as writting metadata in markdown file.
Writing a MetaCell is as simple as writing metadata in markdown file.
```md
# This is title
+ date: 2020-02-22
+ tags: hello, world
```
> Hint: In Jupyter Notebook, pressing `Esc+M` switches the selected cell to Markdown mode.
The `+` and space ` ` before any item will be automatically stripped out. If you don't like it, it's OK. But adding `+ ` will make metadata organized and enhance the readability of metacell:
It is recommended (but not required) to organize the metadata items
with Markdown bullets (`+`, `-` or `*`).
These will be stripped during metadata extraction,
but it keeps the metadata cell nicely readable in the notebook, like this:

### This is title
+ date: 2020-02-22
Expand Down
54 changes: 29 additions & 25 deletions preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,41 +4,45 @@
import markdown


class MetaDataExtractionFailure(Exception):
    """Raised when a line of the metadata cell cannot be parsed.

    The exception message includes the offending line so the notebook
    author can locate and fix it.
    """


class Metadata(Preprocessor):
'''Extract Metadata from first cell. '''
"""Preprocessor to extract metadata from first cell of notebook."""
data = {}
md = None

# Regex for 'key: value' syntax
key_value_regex = re.compile(
r'^\s*[*+-]?\s*(?P<key>[a-zA-Z]+)\s*:\s*(?P<value>.*)$')

@staticmethod
def meta_cell(cell):
'''Process the first cell'''
lines = cell.split('\n')
if lines[0].startswith('# '):
lines[0] = 'title: ' + lines[0][2:]
lines = [l.lstrip("+ ") for l in lines]
def extract_cell_metadata(cell):
"""Extract metadata from the given notebook cell source."""
# Convert Markdown title syntax to 'title:'
cell = re.sub(r'^#+\s*', 'title: ', cell, flags=re.MULTILINE)

# Extract metadata from key-value pairs in non-empty lines
lines = [line.strip() for line in cell.split('\n') if line.strip()]
metadata = {}
for line in lines:
if ':' not in line:
data = {}
return False
key, val = line.split(':', 1)
key = key.strip().lower()
val = val.strip()
if key:
Metadata.data[key] = val
else:
return False
return True
match = Metadata.key_value_regex.match(line)
if not match:
raise MetaDataExtractionFailure(
'Failed to extract metadata with {l!r}'.format(l=line))
key, value = match.group('key', 'value')
metadata[key.lower()] = value
return metadata

@staticmethod
def preprocess(nb, resources):
'''Process the notebook to extract metadata'''
Metadata.data = {}
if Metadata.meta_cell(nb.cells[0]['source']):
nb.cells = nb.cells[1:]
if not nb.cells:
raise Exception('No content cells after metadata extraction!')
else:
raise Exception('Failure in metadata extraction!')
Metadata.data = Metadata.extract_cell_metadata(nb.cells[0]['source'])
nb.cells = nb.cells[1:]
if not nb.cells:
raise Exception('No content cells after metadata extraction!')

if 'summary' in Metadata.data:
Metadata.data['summary'] = Metadata.md.convert(
Metadata.data['summary'])
Expand Down
Empty file added test/__init__.py
Empty file.
62 changes: 62 additions & 0 deletions test/test_preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import textwrap

from ..preprocess import Metadata

import unittest


class MetadataTest(unittest.TestCase):
    """Unit tests for ``Metadata.extract_cell_metadata``."""

    def _check(self, cell_source, expected):
        """Dedent *cell_source*, extract its metadata and compare to *expected*."""
        extracted = Metadata.extract_cell_metadata(textwrap.dedent(cell_source))
        self.assertEqual(expected, extracted)

    def test_extract_cell_metadata_basic(self):
        """A '#' title followed by '+' bullets yields lower-cased keys."""
        self._check(
            """\
            # Animal Farm
            + Author: George Orwell
            + Date: 1945-08-17
            """,
            {
                'title': 'Animal Farm',
                'author': 'George Orwell',
                'date': '1945-08-17',
            },
        )

    def test_extract_cell_metadata_other_bullets(self):
        """'-' and '*' bullets, and lines without any bullet, are accepted."""
        self._check(
            """\
            # Animal Farm
            - Author: George Orwell
            * Date: 1945-08-17
            Tags: books
            """,
            {
                'title': 'Animal Farm',
                'author': 'George Orwell',
                'date': '1945-08-17',
                'tags': 'books',
            },
        )

    def test_extract_cell_metadata_title_variation(self):
        """The title may use '##' and need not be the first line."""
        self._check(
            """\
            + Author: George Orwell
            ## Animal Farm
            + Date: 1945-08-17
            """,
            {
                'title': 'Animal Farm',
                'author': 'George Orwell',
                'date': '1945-08-17',
            },
        )

    def test_extract_cell_metadata_whitespace(self):
        """Whitespace around the ':' separator is ignored."""
        self._check(
            """\
            # Animal Farm
            + Author : George Orwell
            + Date : 1945-08-17
            """,
            {
                'title': 'Animal Farm',
                'author': 'George Orwell',
                'date': '1945-08-17',
            },
        )

0 comments on commit ec532ca

Please sign in to comment.