继续完善对非中文字符的处理

mozillazg · Jun 22, 2015 · 3d52fe8
1 parent d700a65
commit 3d52fe8
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 31 deletions.
diff --git a/pypinyin/__init__.py b/pypinyin/__init__.py
@@ -62,6 +62,12 @@
     |[\u4e00-\u9fff]    # CJK 基本:[4E00-9FFF]
     |[\uf900-\ufaff]    # CJK 兼容:[F900-FAFF]
 )+$''', re.X)
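+# 没有拼音的字符 (NOTE: re.X does NOT ignore whitespace inside [...], so the newlines/spaces in the class below are excluded too; text containing spaces will not match)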
+RE_NONE_HANS = re.compile(r'''^(?:
+    [^\u3400-\u4dbf
+     \u4e00-\u9fff
+     \uf900-\ufaff]
+)+$''', re.X)
 # 分割中文字符和非中文字符
 RE_NONE_HANS_SPLIT = re.compile(r'''
 (?:
@@ -128,12 +134,17 @@ def simple_seg(hans):
     if isinstance(hans, unicode):
         return RE_NONE_HANS_SPLIT.sub('\b', hans).split('\b')
     else:
+        hans = list(hans)
+        if len(hans) == 1:
+            return simple_seg(hans[0])
         return list(chain(*[simple_seg(x) for x in hans]))
 
 
 def seg(hans):
     if getattr(seg, 'no_jieba', None):
         ret = hans
+        return simple_seg(ret)
+
     if seg.jieba is None:
         try:
             import jieba
@@ -142,9 +153,15 @@ def seg(hans):
             seg.no_jieba = True
         return seg(hans)
     else:
-        ret = seg.jieba.cut(hans)
+        hans = simple_seg(hans)
+        ret = []
+        for x in hans:
+            if RE_NONE_HANS.match(x):   # 没有拼音的字符，不再参与二次分词
+                ret.append(x)
+            else:
+                ret.extend(list(seg.jieba.cut(x)))
+        return ret
 
-    return simple_seg(ret)
 seg.jieba = None
 if os.environ.get('PYPINYIN_NO_JIEBA'):
     seg.no_jieba = True

diff --git a/tests/_test_env.py b/tests/_test_env.py
@@ -6,13 +6,10 @@
 os.environ['PYPINYIN_NO_PHRASES'] = 'true'
 os.environ['PYPINYIN_NO_JIEBA'] = 'true'
 
-import pytest
-
 import pypinyin
-from .utils import has_module
 
 
 def test_env():
     assert pypinyin.PHRASES_DICT == {}
     assert pypinyin.seg.no_jieba
-    assert pypinyin.seg('北京') == '北京'
+    assert pypinyin.seg('北京') == ['北京']
diff --git a/tests/test_pinyin.py b/tests/test_pinyin.py
@@ -43,11 +43,7 @@ def test_pinyin_finals():
     """只包含韵母的词语"""
     hans = '嗷嗷'
     assert pinyin(hans) == [['\xe1o'], ['\xe1o']]
-    try:
-        assert pinyin(hans + 'abc') == [['\xe1o'], ['\xe1o'], ['abc']]
-    except AssertionError:
-        assert pinyin(hans + 'abc') == [['\xe1o'], ['\xe1o'],
-                                        ['a'], ['b'], ['c']]
+    assert pinyin(hans + 'abc') == [['\xe1o'], ['\xe1o'], ['abc']]
     assert pinyin(hans, NORMAL) == [['ao'], ['ao']]
     assert pinyin(hans, TONE) == [['\xe1o'], ['\xe1o']]
     assert pinyin(hans, TONE2) == [['a2o'], ['a2o']]
@@ -71,21 +67,14 @@ def test_zh_and_en():
     """中英文混合的情况"""
     # 中英文
     hans = '中心'
-    if has_module('jieba'):
-        assert pinyin(hans + 'abc') == [['zh\u014dng'], ['x\u012bn'], ['abc']]
-    else:
-        assert pinyin(hans + 'abc') == [['zh\u014dng'], ['x\u012bn'],
-                                        ['a'], ['b'], ['c']]
+    assert pinyin(hans + 'abc') == [['zh\u014dng'], ['x\u012bn'], ['abc']]
     # 中英文混合的固定词组
     assert pinyin('黄山B股', style=TONE2) == [['hua2ng'], ['sha1n'], ['B'], ['gu3']]
     assert pinyin('A股', style=TONE2) == [['A'], ['gu3']]
     assert pinyin('阿Q', style=TONE2) == [['a1'], ['Q']]
     assert pinyin('B超', style=TONE2) == [['B'], ['cha1o']]
-    assert pinyin('AB超C', style=TONE2) == [['A'], ['B'], ['cha1o'], ['C']]
-    if has_module('jieba'):
-        assert pinyin('AB阿C', style=TONE2) == [['AB'], ['a1'], ['C']]
-    else:
-        assert pinyin('AB阿C', style=TONE2) == [['A'], ['B'], ['a1'], ['C']]
+    assert pinyin('AB超C', style=TONE2) == [['AB'], ['cha1o'], ['C']]
+    assert pinyin('AB阿C', style=TONE2) == [['AB'], ['a1'], ['C']]
     assert pinyin('维生素C', style=TONE2) == [['we2i'], ['she1ng'], ['su4'], ['C']]
 
 
@@ -120,16 +109,15 @@ def test_seg_jieba():
     assert pinyin('A股', style=TONE2) == [['A'], ['gu3']]
     assert pinyin('阿Q', style=TONE2) == [['a1'], ['Q']]
     assert pinyin('B超', style=TONE2) == [['B'], ['cha1o']]
-    assert pinyin('AB超C', style=TONE2) == [['A'], ['B'], ['cha1o'], ['C']]
+    assert pinyin('AB超C', style=TONE2) == [['AB'], ['cha1o'], ['C']]
     assert pinyin('AB阿C', style=TONE2) == [['AB'], ['a1'], ['C']]
     assert pinyin('维生素C', style=TONE2) == [['we2i'], ['she1ng'], ['su4'], ['C']]
 
 
 @pytest.mark.skipif(not has_module('snownlp'), reason='cant import snownlp')
 def test_other_seg_module():
-    hans = '音乐123'
-    assert lazy_pinyin(hans, style=TONE2) == [u'yi1n', u'le4', u'1', u'2', u'3']
     from snownlp import SnowNLP
+    hans = '音乐123'
     hans_seg = SnowNLP(hans).words
     assert lazy_pinyin(hans_seg, style=TONE2) == [u'yi1n', u'yue4', u'123']
 
@@ -170,16 +158,16 @@ def test_errors():
 
 
 def test_errors_callable():
-    def foobar(char):
-        return 'a'
+    def foobar(chars):
+        return 'a' * len(chars)
 
     class Foobar(object):
-        def __call__(self, char):
-            return 'a'
+        def __call__(self, chars):
+            return 'a' * len(chars)
 
     n = 5
-    assert lazy_pinyin('あ' * n, errors=foobar) == ['a'] * n
-    assert lazy_pinyin('あ' * n, errors=Foobar()) == ['a'] * n
+    assert lazy_pinyin('あ' * n, errors=foobar) == ['a' * n]
+    assert lazy_pinyin('あ' * n, errors=Foobar()) == ['a' * n]
 
 
 def test_update():
@@ -221,4 +209,4 @@ def test_simple_seg():
     hans = '你好にほんごРусский язык'
     ret = 'ni3 ha3o'
     errors = lambda x: None
-    assert slug(hans, style=TONE2, separator=' ', errors=errors)
+    assert slug(hans, style=TONE2, separator=' ', errors=errors) == ret