Add support for - and * bullet style in the metadata cell

Made the metadata extraction less strict: added support for the `-` and `*` bullet styles, the title no longer has to be on the first line, and the title may also use `##` or a higher heading level. Simplified error handling during extraction, but made the exception more helpful. Also added some unit tests for the metadata cell parsing.
peijunz · May 10, 2018 · ec532ca · ec532ca
1 parent 33290cf
commit ec532ca
Show file tree

Hide file tree

Showing 4 changed files with 96 additions and 27 deletions.
diff --git a/README.md b/README.md
@@ -10,15 +10,18 @@ Exact Idea of MetaCell is:
 
 > **All and Only** Metadata should be stored at the first Cell of ipynb
 
-Writing a MetaCell is as simple as writting metadata in markdown file.
+Writing a MetaCell is as simple as writing metadata in markdown file.
 ```md
 # This is title
 + date: 2020-02-22
 + tags: hello, world
 ```
 > Hint: In Jupyter Notebook, pressing `Esc+M` switches the selected cell to Markdown mode.
 
-The `+` and space ` ` before any item will be automatically stripped out. If you don't like it, it's OK. But adding `+ ` will make metadata organized and enhance the readability of metacell:
+It is recommended (but not required) to organize the metadata items
+with Markdown bullets (`+`, `-` or `*`).
+These will be stripped during metadata extraction,
+but it keeps the metadata cell nicely readable in the notebook, like this:
 
 ### This is title
 + date: 2020-02-22

diff --git a/preprocess.py b/preprocess.py
@@ -4,41 +4,45 @@
 import markdown
 
 
+class MetaDataExtractionFailure(Exception):
+    pass
+
+
 class Metadata(Preprocessor):
-    '''Extract Metadata from first cell. '''
+    """Preprocessor to extract metadata from first cell of notebook."""
     data = {}
     md = None
 
+    # Regex for 'key: value' syntax
+    key_value_regex = re.compile(
+        r'^\s*[*+-]?\s*(?P<key>[a-zA-Z]+)\s*:\s*(?P<value>.*)$')
+
     @staticmethod
-    def meta_cell(cell):
-        '''Process the first cell'''
-        lines = cell.split('\n')
-        if lines[0].startswith('# '):
-            lines[0] = 'title: ' + lines[0][2:]
-        lines = [l.lstrip("+ ") for l in lines]
+    def extract_cell_metadata(cell):
+        """Extract metadata from the given notebook cell source."""
+        # Convert Markdown title syntax to 'title:'
+        cell = re.sub(r'^#+\s*', 'title: ', cell, flags=re.MULTILINE)
+
+        # Extract metadata from key-value pairs in non-empty lines
+        lines = [line.strip() for line in cell.split('\n') if line.strip()]
+        metadata = {}
         for line in lines:
-            if ':' not in line:
-                data = {}
-                return False
-            key, val = line.split(':', 1)
-            key = key.strip().lower()
-            val = val.strip()
-            if key:
-                Metadata.data[key] = val
-            else:
-                return False
-        return True
+            match = Metadata.key_value_regex.match(line)
+            if not match:
+                raise MetaDataExtractionFailure(
+                    'Failed to extract metadata with {l!r}'.format(l=line))
+            key, value = match.group('key', 'value')
+            metadata[key.lower()] = value
+        return metadata
 
     @staticmethod
     def preprocess(nb, resources):
         '''Process the notebook to extract metadata'''
-        Metadata.data = {}
-        if Metadata.meta_cell(nb.cells[0]['source']):
-            nb.cells = nb.cells[1:]
-            if not nb.cells:
-                raise Exception('No content cells after metadata extraction!')
-        else:
-            raise Exception('Failure in metadata extraction!')
+        Metadata.data = Metadata.extract_cell_metadata(nb.cells[0]['source'])
+        nb.cells = nb.cells[1:]
+        if not nb.cells:
+            raise Exception('No content cells after metadata extraction!')
+
         if 'summary' in Metadata.data:
             Metadata.data['summary'] = Metadata.md.convert(
                 Metadata.data['summary'])

diff --git a/test/__init__.py b/test/__init__.py
diff --git a/test/test_preprocess.py b/test/test_preprocess.py
@@ -0,0 +1,62 @@
+import textwrap
+
+from ..preprocess import Metadata
+
+import unittest
+
+
+class MetadataTest(unittest.TestCase):
+
+    def test_extract_cell_metadata_basic(self):
+        metadata = Metadata.extract_cell_metadata(textwrap.dedent("""\
+            # Animal Farm
+            + Author: George Orwell
+            + Date: 1945-08-17
+        """))
+        expected = {
+            'title': 'Animal Farm',
+            'author': 'George Orwell',
+            'date': '1945-08-17',
+        }
+        self.assertEqual(expected, metadata)
+
+    def test_extract_cell_metadata_other_bullets(self):
+        metadata = Metadata.extract_cell_metadata(textwrap.dedent("""\
+            # Animal Farm
+            - Author: George Orwell
+            * Date: 1945-08-17
+            Tags: books
+        """))
+        expected = {
+            'title': 'Animal Farm',
+            'author': 'George Orwell',
+            'date': '1945-08-17',
+            'tags': 'books',
+        }
+        self.assertEqual(expected, metadata)
+
+    def test_extract_cell_metadata_title_variation(self):
+        metadata = Metadata.extract_cell_metadata(textwrap.dedent("""\
+            + Author: George Orwell
+            ## Animal Farm
+            + Date: 1945-08-17
+        """))
+        expected = {
+            'title': 'Animal Farm',
+            'author': 'George Orwell',
+            'date': '1945-08-17',
+        }
+        self.assertEqual(expected, metadata)
+
+    def test_extract_cell_metadata_whitespace(self):
+        metadata = Metadata.extract_cell_metadata(textwrap.dedent("""\
+            #  Animal Farm
+               +  Author   :   George Orwell
+               +  Date     :   1945-08-17
+        """))
+        expected = {
+            'title': 'Animal Farm',
+            'author': 'George Orwell',
+            'date': '1945-08-17',
+        }
+        self.assertEqual(expected, metadata)