Fix for #96, Italics parsing broken (#155)

* Fix for #96: italics parsing broke when an underscore delimiter was followed by certain characters. The issue was that smart quotes, as well as any other non-ASCII punctuation characters, were not handled like ASCII punctuation when parsing emphasis/strong tokens. Solved by including all Unicode punctuation in the set of punctuation characters.
* Added test cases for emphasis without punctuation, expecting different behavior for underscore and asterisk delimiters.
miyuchina · Aug 21, 2022 · 0c05dcd · 0c05dcd
1 parent 8fb47b6
commit 0c05dcd
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 3 deletions.
diff --git a/mistletoe/core_tokens.py b/mistletoe/core_tokens.py
@@ -1,14 +1,25 @@
 import re
+import sys
+from unicodedata import category
 
 
 whitespace = {' ', '\t', '\n', '\x0b', '\x0c', '\r'}
 unicode_whitespace = {'\t', '\n', '\x0b', '\x0c', '\r', '\x1c', '\x1d', '\x1e',
         '\x1f', ' ', '\x85', '\xa0', '\u1680', '\u2000', '\u2001', '\u2002',
         '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009',
         '\u200a', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000'}
-punctuation = {'!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',',
-               '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\',
-               ']', '^', '_', '`', '{', '|', '}', '~'}
+
+# punctuation: _ASCII and Unicode punctuation characters_ as defined at
+# <https://spec.commonmark.org/0.30/#ascii-punctuation-character> and
+# <https://spec.commonmark.org/0.30/#unicode-punctuation-character>
+unicode_chrs = (chr(i) for i in range(sys.maxunicode + 1))
+punctuation = set.union(
+        {'!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',',
+        '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\',
+        ']', '^', '_', '`', '{', '|', '}', '~'},
+        {c for c in unicode_chrs if category(c).startswith("P")},
+)
+
 code_pattern = re.compile(r"(?<!\\|`)(?:\\\\)*(`+)(?!`)(.+?)(?<!`)\1(?!`)", re.DOTALL)
 
 

diff --git a/test/test_span_token.py b/test/test_span_token.py
@@ -35,6 +35,25 @@ def test_parse(self):
         self._test_parse(span_token.Emphasis, '*some text*', 'some text')
         self._test_parse(span_token.Emphasis, '_some text_', 'some text')
 
+    def test_emphasis_with_straight_quote(self):
+        tokens = iter(span_token.tokenize_inner('_Book Title_\'s author'))
+        self._test_token(next(tokens), 'Book Title', children=True)
+        self._test_token(next(tokens), '\'s author', children=False)
+
+    def test_emphasis_with_smart_quote(self):
+        tokens = iter(span_token.tokenize_inner('_Book Title_’s author'))
+        self._test_token(next(tokens), 'Book Title', children=True)
+        self._test_token(next(tokens), '’s author', children=False)
+
+    def test_no_emphasis_for_underscore_without_punctuation(self):
+        tokens = iter(span_token.tokenize_inner('_an example without_punctuation'))
+        self._test_token(next(tokens), '_an example without_punctuation', children=True)
+
+    def test_emphasis_for_asterisk_without_punctuation(self):
+        tokens = iter(span_token.tokenize_inner('*an example without*punctuation'))
+        self._test_token(next(tokens), 'an example without', children=True)
+        self._test_token(next(tokens), 'punctuation', children=False)
+
 
 class TestInlineCode(TestBranchToken):
     def _test_parse_enclosed(self, encl_type, encl_delimiter):