Fix handling of combining characters and mitigate homograph attacks #2

Merged
merged 2 commits on Dec 13, 2011
View
2 slugify/__init__.py
@@ -11,7 +11,7 @@ def slugify(s, ok=SLUG_OK, lower=True, spaces=False):
# L and N signify letter/number.
# http://www.unicode.org/reports/tr44/tr44-4.html#GC_Values_Table
rv = []
- for c in smart_unicode(s):
+ for c in unicodedata.normalize('NFKC', smart_unicode(s)):
cat = unicodedata.category(c)[0]
if cat in 'LN' or c in ok:
rv.append(c)
View
6 slugify/tests.py
@@ -21,6 +21,12 @@ def check(x, y):
(' a ', 'a'),
('tags/', 'tags'),
('holy_wars', 'holy_wars'),
+ # Make sure we get a consistent result with decomposed chars:
+ (u'el ni\N{LATIN SMALL LETTER N WITH TILDE}o', u'el-ni\xf1o'),
@davedash
davedash Dec 13, 2011

This is amazing... the \N{CHARACTER NAME} named-character escape syntax.

+ (u'el nin\N{COMBINING TILDE}o', u'el-ni\xf1o'),
+ # Ensure we normalize appearance-only glyphs into their compatibility
+ # forms:
+ (u'\N{LATIN SMALL LIGATURE FI}lms', u'films'),
# I don't really care what slugify returns. Just don't crash.
(u'x𘍿', u'x'),
(u'ϧ΃𘒬𘓣', u'\u03e7'),