Browse files

Just return non-ascii words as-is.

  • Loading branch information...
1 parent 02b0cfd commit 81d3932ab4f1002763da1e5e746cdbd055f2a119 @mdirolf committed Nov 18, 2011
Showing with 8 additions and 1 deletion.
  1. +8 −1 Stemmer.py
View
9 Stemmer.py
@@ -337,7 +337,10 @@ def _stem(cls, word):
if isinstance(word, unicode):
was_unicode = True
- word = word.encode('utf-8')
+ try:
+ word = word.encode('ascii')
+ except:
+ return word
if len(word) <= 2:
return word
@@ -716,6 +719,10 @@ def testStem(self):
self.assertEqual(stemmer.stemWord('exceeding'), 'exceed')
self.assertEqual(stemmer.stemWord('succeeds'), 'succeed')
+ # Non-ascii
+ self.assertEqual(stemmer.stemWord(u'czy\u017ce'), u'czy\u017ce')
+ self.assertEqual(stemmer.stemWord(u'eug\xe8neysa\xffe'), u'eug\xe8neysa\xffe')
+
# hardcore test
infile = open('./voc.txt', 'r')
outfile = open('./stemmedvoc.txt', 'r')

0 comments on commit 81d3932

Please sign in to comment.