From a32d0fcf0ee16389d20efeba021f3046675afb13 Mon Sep 17 00:00:00 2001 From: Steven Myint Date: Sun, 31 Mar 2013 07:27:31 -0700 Subject: [PATCH] Split on all non-words Previously, there were some special cases (like "<"). This change takes care of all non-words instead of just special cases. This resolves item 3 of issue #16 in an alternate way. --- misspellings_lib.py | 2 +- tests/test_class.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/misspellings_lib.py b/misspellings_lib.py index 5fc2c79..5cf9311 100644 --- a/misspellings_lib.py +++ b/misspellings_lib.py @@ -15,7 +15,7 @@ import string _NORM_REGEX = re.compile('([a-z])([A-Z][a-z])') -_WORD_REGEX = re.compile('[\s_0-9<>/,\.]+') +_WORD_REGEX = re.compile('[\s_0-9\W]+', flags=re.UNICODE) def normalize(word): diff --git a/tests/test_class.py b/tests/test_class.py index 918a2a0..ee3aeb4 100755 --- a/tests/test_class.py +++ b/tests/test_class.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- # For Python 2.5 from __future__ import with_statement @@ -135,8 +136,12 @@ def testSplitWordsWithCamelCase(self): self.assertEqual(['one', 'Two', 'Three', 'four', 'five'], misspellings.split_words('oneTwoThree_four five')) - def testNormalize(self): - self.assertEqual('alpha', misspellings.normalize('"alpha".')) + def testSplitWordsWithOtherCharacters(self): + self.assertEqual(['the', 'big', 'cat'], + misspellings.split_words('the%big$cat')) + + def testNormalize(self): + self.assertEqual('alpha', misspellings.normalize('"alpha".')) if __name__ == '__main__':