Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use str.isidentifier to match idents on python 3 #731

Merged
merged 9 commits into from Jul 4, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 9 additions & 0 deletions .travis.yml
@@ -1,5 +1,6 @@
sudo: false
language: python

python:
- "2.6"
- "2.7"
Expand All @@ -8,10 +9,18 @@ python:
- "3.4"
- "3.5"
- "3.6"

install:
- pip install tox

script:
- tox -e py

branches:
only:
- master
- /^.*-maintenance$/

notifications:
email: false
irc:
Expand Down
2 changes: 2 additions & 0 deletions jinja2/_identifier.py
@@ -0,0 +1,2 @@
# generated by scripts/generate_identifier_pattern.py
pattern = '·̀-ͯ·҃-֑҇-ׇֽֿׁׂׅׄؐ-ًؚ-ٰٟۖ-ۜ۟-۪ۤۧۨ-ܑۭܰ-݊ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙࠭-࡛ࣔ-ࣣ࣡-ःऺ-़ा-ॏ॑-ॗॢॣঁ-ঃ়া-ৄেৈো-্ৗৢৣਁ-ਃ਼ਾ-ੂੇੈੋ-੍ੑੰੱੵઁ-ઃ઼ા-ૅે-ૉો-્ૢૣଁ-ଃ଼ା-ୄେୈୋ-୍ୖୗୢୣஂா-ூெ-ைொ-்ௗఀ-ఃా-ౄె-ైొ-్ౕౖౢౣಁ-ಃ಼ಾ-ೄೆ-ೈೊ-್ೕೖೢೣഁ-ഃാ-ൄെ-ൈൊ-്ൗൢൣංඃ්ා-ුූෘ-ෟෲෳัิ-ฺ็-๎ັິ-ູົຼ່-ໍ༹༘༙༵༷༾༿ཱ-྄྆྇ྍ-ྗྙ-ྼ࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏႚ-ႝ፝-፟ᜒ-᜔ᜲ-᜴ᝒᝓᝲᝳ឴-៓៝᠋-᠍ᢅᢆᢩᤠ-ᤫᤰ-᤻ᨗ-ᨛᩕ-ᩞ᩠-᩿᩼᪰-᪽ᬀ-ᬄ᬴-᭄᭫-᭳ᮀ-ᮂᮡ-ᮭ᯦-᯳ᰤ-᰷᳐-᳔᳒-᳨᳭ᳲ-᳴᳸᳹᷀-᷵᷻-᷿‿⁀⁔⃐-⃥⃜⃡-⃰℘℮⳯-⵿⳱ⷠ-〪ⷿ-゙゚〯꙯ꙴ-꙽ꚞꚟ꛰꛱ꠂ꠆ꠋꠣ-ꠧꢀꢁꢴ-ꣅ꣠-꣱ꤦ-꤭ꥇ-꥓ꦀ-ꦃ꦳-꧀ꧥꨩ-ꨶꩃꩌꩍꩻ-ꩽꪰꪲ-ꪴꪷꪸꪾ꪿꫁ꫫ-ꫯꫵ꫶ꯣ-ꯪ꯬꯭ﬞ︀-️︠-︯︳︴﹍-﹏_𐇽𐋠𐍶-𐍺𐨁-𐨃𐨅𐨆𐨌-𐨏𐨸-𐨿𐨺𐫦𐫥𑀀-𑀂𑀸-𑁆𑁿-𑂂𑂰-𑂺𑄀-𑄂𑄧-𑅳𑄴𑆀-𑆂𑆳-𑇊𑇀-𑇌𑈬-𑈷𑈾𑋟-𑋪𑌀-𑌃𑌼𑌾-𑍄𑍇𑍈𑍋-𑍍𑍗𑍢𑍣𑍦-𑍬𑍰-𑍴𑐵-𑑆𑒰-𑓃𑖯-𑖵𑖸-𑗀𑗜𑗝𑘰-𑙀𑚫-𑚷𑜝-𑜫𑰯-𑰶𑰸-𑰿𑲒-𑲧𑲩-𑲶𖫰-𖫴𖬰-𖬶𖽑-𖽾𖾏-𖾒𛲝𛲞𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝨀-𝨶𝨻-𝩬𝩵𝪄𝪛-𝪟𝪡-𝪯𞀀-𞀆𞀈-𞀘𞀛-𞀡𞀣𞀤𞀦-𞣐𞀪-𞣖𞥄-𞥊󠄀-󠇯'
71 changes: 0 additions & 71 deletions jinja2/_stringdefs.py

This file was deleted.

51 changes: 25 additions & 26 deletions jinja2/lexer.py
Expand Up @@ -15,14 +15,12 @@
:license: BSD, see LICENSE for more details.
"""
import re
import sys

from operator import itemgetter
from collections import deque
from operator import itemgetter

from jinja2._compat import implements_iterator, intern, iteritems, text_type
from jinja2.exceptions import TemplateSyntaxError
from jinja2.utils import LRUCache
from jinja2._compat import iteritems, implements_iterator, text_type, intern


# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
Expand All @@ -34,28 +32,25 @@
r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')

def _make_name_re():
try:
compile('föö', '<unknown>', 'eval')
except SyntaxError:
return re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')

try:
# check if this Python supports Unicode identifiers
compile('föö', '<unknown>', 'eval')
except SyntaxError:
# no Unicode support, use ASCII identifiers
name_re = re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')
check_ident = False
else:
# Unicode support, build a pattern to match valid characters, and set flag
# to use str.isidentifier to validate during lexing
from jinja2 import _identifier
name_re = re.compile(r'[\w{0}]+'.format(_identifier.pattern))
check_ident = True
# remove the pattern from memory after building the regex
import sys
del sys.modules['jinja2._identifier']
import jinja2
from jinja2 import _stringdefs
name_re = re.compile(r'[%s][%s]*' % (_stringdefs.xid_start,
_stringdefs.xid_continue))

# Save some memory here
sys.modules.pop('jinja2._stringdefs')
del _stringdefs
del jinja2._stringdefs

return name_re

# we use the unicode identifier rule if this python version is able
# to handle unicode identifiers, otherwise the standard ASCII one.
name_re = _make_name_re()
del _make_name_re
del jinja2._identifier
del _identifier

float_re = re.compile(r'(?<!\.)\d+\.\d+')
newline_re = re.compile(r'(\r\n|\r|\n)')
Expand Down Expand Up @@ -577,6 +572,10 @@ def wrap(self, stream, name=None, filename=None):
token = value
elif token == 'name':
value = str(value)
if check_ident and not value.isidentifier():
raise TemplateSyntaxError(
'Invalid character in identifier',
lineno, name, filename)
elif token == 'string':
# try to unescape string
try:
Expand Down
77 changes: 77 additions & 0 deletions scripts/generate_identifier_pattern.py
@@ -0,0 +1,77 @@
#!/usr/bin/env python3
import itertools
import os
import re
import sys

# Refuse to run under Python 2: the generator relies on Python 3's ``chr`` and
# ``str.isidentifier`` to enumerate identifier characters.
# NOTE(review): this file also uses f-strings further down, so an interpreter
# older than 3.6 rejects the module at compile time and never reaches this
# check.
if sys.version_info[0] < 3:
    raise RuntimeError('This needs to run on Python 3.')


def get_characters():
    r"""Find every Unicode character that is valid in a Python `identifier`_ but
    is not matched by the regex ``\w`` group.

    ``\w`` matches some characters that aren't valid in identifiers, but
    :meth:`str.isidentifier` will catch that later in lexing.

    All start characters are valid continue characters, so we only test for
    continue characters.

    Yields one-character strings in ascending code point order.

    .. _identifier: https://docs.python.org/3/reference/lexical_analysis.html#identifiers
    """
    # Compile once; the loop below visits every code point up to
    # ``sys.maxunicode``.
    is_word_char = re.compile(r'\w').match

    for cp in range(sys.maxunicode + 1):
        s = chr(cp)

        # Prefix with 'a' so that ``s`` is checked as a *continue* character
        # rather than as the first character of the identifier.
        if ('a' + s).isidentifier() and not is_word_char(s):
            yield s


def collapse_ranges(data):
    """Collapse sorted, unique characters into runs of sequential code points.

    ``data`` can be any iterable of one-character strings in ascending order;
    it is consumed once. For every run of consecutive code points a
    ``(first, last)`` tuple is yielded; an isolated character yields a tuple
    whose two items are equal.

    Source: https://stackoverflow.com/a/4629241/400617
    """
    # Inside a run of consecutive code points the difference between the code
    # point and its position in the sequence stays constant, so that
    # difference works as the grouping key.
    for _, group in itertools.groupby(
        enumerate(data),
        key=lambda item: ord(item[1]) - item[0]
    ):
        run = [char for _, char in group]
        yield run[0], run[-1]


def build_pattern(ranges):
    """Render ``(first, last)`` character ranges as the body of a regex
    character class.

    A range covering a single character is written as that character, a range
    covering two adjacent characters is written as both characters, and
    anything else is written as ``first-last``.
    """
    def render(first, last):
        # A lone character needs no range syntax.
        if first == last:
            return first

        # For two neighbours, spelling both out is as short as a range.
        if ord(last) - ord(first) == 1:
            return first + last

        return f'{first}-{last}'

    return ''.join(render(first, last) for first, last in ranges)


def main():
    """Generate the identifier pattern and store it in
    :file:`jinja2/_identifier.py`, resolved relative to this script's
    directory.

    The target file is overwritten with a header comment followed by a single
    ``pattern = '...'`` assignment, encoded as UTF-8.
    """
    characters = get_characters()
    pattern = build_pattern(collapse_ranges(characters))

    script_dir = os.path.dirname(__file__)
    target = os.path.join(script_dir, '..', 'jinja2', '_identifier.py')
    filename = os.path.abspath(target)

    lines = (
        '# generated by scripts/generate_identifier_pattern.py\n',
        f'pattern = \'{pattern}\'\n',
    )

    with open(filename, 'w', encoding='utf8') as f:
        for line in lines:
            f.write(line)


# Only regenerate the pattern file when executed as a script, not on import.
if __name__ == '__main__':
    main()
27 changes: 27 additions & 0 deletions tests/test_lexnparse.py
Expand Up @@ -126,6 +126,33 @@ def test_trailing_newline(self, env):
result = tmpl.render()
assert result == expect, (keep, template, result, expect)

@pytest.mark.parametrize('name,valid2,valid3', (
    (u'foo', True, True),
    (u'föö', False, True),
    (u'き', False, True),
    (u'_', True, True),
    (u'1a', False, False),  # invalid ascii start
    (u'a-', False, False),  # invalid ascii continue
    (u'🐍', False, False),  # invalid unicode start
    (u'a🐍', False, False),  # invalid unicode continue
    # start characters not matched by \w
    (u'\u1885', False, True),
    (u'\u1886', False, True),
    (u'\u2118', False, True),
    (u'\u212e', False, True),
    # continue character not matched by \w
    (u'\xb7', False, False),
    (u'a\xb7', False, True),
))
def test_name(self, env, name, valid2, valid3):
    """Check that ``name`` is accepted or rejected as a template identifier.

    ``valid2`` is the expected outcome when running on Python 2 and
    ``valid3`` the expected outcome on Python 3.
    """
    source = u'{{ ' + name + u' }}'
    # pick the expectation that applies to the interpreter under test
    should_compile = valid2 if PY2 else valid3

    if not should_compile:
        pytest.raises(TemplateSyntaxError, env.from_string, source)
        return

    # valid for version being tested, shouldn't raise
    env.from_string(source)


@pytest.mark.lexnparse
@pytest.mark.parser
Expand Down