Skip to content

Commit

Permalink
Fix for #96, Italics parsing broken (#155)
Browse files Browse the repository at this point in the history
* Fix for #96, Italics parsing broken when underscore is followed by some characters.

The issue was that smart quotes, like all other non-ASCII punctuation characters, were not treated as punctuation when parsing emphasis/strong tokens.
Solved by including all Unicode punctuation characters (those whose general category starts with "P") in the set of punctuation characters.

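* Added test cases for emphasis delimiters that are not followed by punctuation. Underscore and asterisk delimiters are expected to behave differently here: `_an example without_punctuation` should produce no emphasis, while `*an example without*punctuation` should.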
  • Loading branch information
anderskaplan committed Aug 21, 2022
1 parent 8fb47b6 commit 0c05dcd
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 3 deletions.
17 changes: 14 additions & 3 deletions mistletoe/core_tokens.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,25 @@
import re
import sys
from unicodedata import category


# ASCII whitespace characters.
whitespace = set(' \t\n\x0b\x0c\r')

# Whitespace characters across Unicode; a superset of the ASCII set above.
unicode_whitespace = set(
    '\t\n\x0b\x0c\r\x1c\x1d\x1e\x1f \x85\xa0\u1680'
    '\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a'
    '\u2028\u2029\u202f\u205f\u3000'
)

# The 32 ASCII punctuation characters.
punctuation = set('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

# punctuation: the union of the _ASCII punctuation characters_ and the
# _Unicode punctuation characters_, as defined at
# <https://spec.commonmark.org/0.30/#ascii-punctuation-character> and
# <https://spec.commonmark.org/0.30/#unicode-punctuation-character>.
# The Unicode part is every code point whose general category starts with "P".
# NOTE: unicode_chrs is a one-shot generator; it is exhausted once the set
# below has been built.
unicode_chrs = (chr(codepoint) for codepoint in range(sys.maxunicode + 1))
punctuation = set('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~') | {
    char for char in unicode_chrs if category(char)[0] == "P"
}

# Inline code span: a run of backticks, the content, and a closing run of
# the same length. The adjacent raw strings concatenate into one pattern.
code_pattern = re.compile(
    r"(?<!\\|`)"    # not directly preceded by a backslash or a backtick
    r"(?:\\\\)*"    # any number of backslash pairs
    r"(`+)"         # group 1: the opening backtick run
    r"(?!`)"        # ...which must be the whole run
    r"(.+?)"        # group 2: the content, matched lazily
    r"(?<!`)\1"     # closing run identical to group 1, not preceded by a backtick
    r"(?!`)",       # ...and not followed by one
    flags=re.DOTALL,
)


Expand Down
19 changes: 19 additions & 0 deletions test/test_span_token.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,25 @@ def test_parse(self):
self._test_parse(span_token.Emphasis, '*some text*', 'some text')
self._test_parse(span_token.Emphasis, '_some text_', 'some text')

def test_emphasis_with_straight_quote(self):
    # Underscore emphasis immediately followed by an ASCII apostrophe:
    # the first token should carry 'Book Title', the second the remainder.
    parsed = iter(span_token.tokenize_inner('_Book Title_\'s author'))
    emphasized = next(parsed)
    self._test_token(emphasized, 'Book Title', children=True)
    remainder = next(parsed)
    self._test_token(remainder, '\'s author', children=False)

def test_emphasis_with_smart_quote(self):
    # Same shape as the straight-quote case, but the closing underscore is
    # followed by a typographic apostrophe (U+2019) instead of an ASCII one.
    parsed = iter(span_token.tokenize_inner('_Book Title_’s author'))
    emphasized = next(parsed)
    self._test_token(emphasized, 'Book Title', children=True)
    remainder = next(parsed)
    self._test_token(remainder, '’s author', children=False)

def test_no_emphasis_for_underscore_without_punctuation(self):
    # The closing underscore is followed directly by a letter, so the whole
    # input is expected to come back unchanged as the first token.
    source = '_an example without_punctuation'
    first = next(iter(span_token.tokenize_inner(source)))
    self._test_token(first, source, children=True)

def test_emphasis_for_asterisk_without_punctuation(self):
    # Unlike the underscore case, an asterisk followed directly by a letter
    # is still expected to close the emphasis.
    parsed = iter(span_token.tokenize_inner('*an example without*punctuation'))
    emphasized = next(parsed)
    self._test_token(emphasized, 'an example without', children=True)
    remainder = next(parsed)
    self._test_token(remainder, 'punctuation', children=False)


class TestInlineCode(TestBranchToken):
def _test_parse_enclosed(self, encl_type, encl_delimiter):
Expand Down

0 comments on commit 0c05dcd

Please sign in to comment.