Skip to content

Commit

Permalink
继续完善对非中文字符的处理
Browse files Browse the repository at this point in the history
  • Loading branch information
mozillazg committed Jun 22, 2015
1 parent d700a65 commit 3d52fe8
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 31 deletions.
21 changes: 19 additions & 2 deletions pypinyin/__init__.py
Expand Up @@ -62,6 +62,12 @@
|[\u4e00-\u9fff] # CJK 基本:[4E00-9FFF]
|[\uf900-\ufaff] # CJK 兼容:[F900-FAFF]
)+$''', re.X)
# Characters that have no pinyin: the pattern matches only when every
# character of the string lies outside the three CJK ranges below
# (U+3400-U+4DBF, U+4E00-U+9FFF, U+F900-U+FAFF).
# NOTE: re.X does not ignore whitespace inside a character class, so the
# class must stay on a single line; splitting it across lines would add the
# line breaks (and any indentation) to the excluded set.
RE_NONE_HANS = re.compile(r'''^(?:
[^\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]
)+$''', re.X)
# 分割中文字符和非中文字符
RE_NONE_HANS_SPLIT = re.compile(r'''
(?:
Expand Down Expand Up @@ -128,12 +134,17 @@ def simple_seg(hans):
if isinstance(hans, unicode):
return RE_NONE_HANS_SPLIT.sub('\b', hans).split('\b')
else:
hans = list(hans)
if len(hans) == 1:
return simple_seg(hans[0])
return list(chain(*[simple_seg(x) for x in hans]))


def seg(hans):
if getattr(seg, 'no_jieba', None):
ret = hans
return simple_seg(ret)

if seg.jieba is None:
try:
import jieba
Expand All @@ -142,9 +153,15 @@ def seg(hans):
seg.no_jieba = True
return seg(hans)
else:
ret = seg.jieba.cut(hans)
hans = simple_seg(hans)
ret = []
for x in hans:
if RE_NONE_HANS.match(x): # 没有拼音的字符,不再参与二次分词
ret.append(x)
else:
ret.extend(list(seg.jieba.cut(x)))
return ret

return simple_seg(ret)
seg.jieba = None
if os.environ.get('PYPINYIN_NO_JIEBA'):
seg.no_jieba = True
Expand Down
5 changes: 1 addition & 4 deletions tests/_test_env.py
Expand Up @@ -6,13 +6,10 @@
os.environ['PYPINYIN_NO_PHRASES'] = 'true'
os.environ['PYPINYIN_NO_JIEBA'] = 'true'

import pytest

import pypinyin
from .utils import has_module


def test_env():
assert pypinyin.PHRASES_DICT == {}
assert pypinyin.seg.no_jieba
assert pypinyin.seg('北京') == '北京'
assert pypinyin.seg('北京') == ['北京']
38 changes: 13 additions & 25 deletions tests/test_pinyin.py
Expand Up @@ -43,11 +43,7 @@ def test_pinyin_finals():
"""只包含韵母的词语"""
hans = '嗷嗷'
assert pinyin(hans) == [['\xe1o'], ['\xe1o']]
try:
assert pinyin(hans + 'abc') == [['\xe1o'], ['\xe1o'], ['abc']]
except AssertionError:
assert pinyin(hans + 'abc') == [['\xe1o'], ['\xe1o'],
['a'], ['b'], ['c']]
assert pinyin(hans + 'abc') == [['\xe1o'], ['\xe1o'], ['abc']]
assert pinyin(hans, NORMAL) == [['ao'], ['ao']]
assert pinyin(hans, TONE) == [['\xe1o'], ['\xe1o']]
assert pinyin(hans, TONE2) == [['a2o'], ['a2o']]
Expand All @@ -71,21 +67,14 @@ def test_zh_and_en():
"""中英文混合的情况"""
# 中英文
hans = '中心'
if has_module('jieba'):
assert pinyin(hans + 'abc') == [['zh\u014dng'], ['x\u012bn'], ['abc']]
else:
assert pinyin(hans + 'abc') == [['zh\u014dng'], ['x\u012bn'],
['a'], ['b'], ['c']]
assert pinyin(hans + 'abc') == [['zh\u014dng'], ['x\u012bn'], ['abc']]
# 中英文混合的固定词组
assert pinyin('黄山B股', style=TONE2) == [['hua2ng'], ['sha1n'], ['B'], ['gu3']]
assert pinyin('A股', style=TONE2) == [['A'], ['gu3']]
assert pinyin('阿Q', style=TONE2) == [['a1'], ['Q']]
assert pinyin('B超', style=TONE2) == [['B'], ['cha1o']]
assert pinyin('AB超C', style=TONE2) == [['A'], ['B'], ['cha1o'], ['C']]
if has_module('jieba'):
assert pinyin('AB阿C', style=TONE2) == [['AB'], ['a1'], ['C']]
else:
assert pinyin('AB阿C', style=TONE2) == [['A'], ['B'], ['a1'], ['C']]
assert pinyin('AB超C', style=TONE2) == [['AB'], ['cha1o'], ['C']]
assert pinyin('AB阿C', style=TONE2) == [['AB'], ['a1'], ['C']]
assert pinyin('维生素C', style=TONE2) == [['we2i'], ['she1ng'], ['su4'], ['C']]


Expand Down Expand Up @@ -120,16 +109,15 @@ def test_seg_jieba():
assert pinyin('A股', style=TONE2) == [['A'], ['gu3']]
assert pinyin('阿Q', style=TONE2) == [['a1'], ['Q']]
assert pinyin('B超', style=TONE2) == [['B'], ['cha1o']]
assert pinyin('AB超C', style=TONE2) == [['A'], ['B'], ['cha1o'], ['C']]
assert pinyin('AB超C', style=TONE2) == [['AB'], ['cha1o'], ['C']]
assert pinyin('AB阿C', style=TONE2) == [['AB'], ['a1'], ['C']]
assert pinyin('维生素C', style=TONE2) == [['we2i'], ['she1ng'], ['su4'], ['C']]


@pytest.mark.skipif(not has_module('snownlp'), reason='cant import snownlp')
def test_other_seg_module():
hans = '音乐123'
assert lazy_pinyin(hans, style=TONE2) == [u'yi1n', u'le4', u'1', u'2', u'3']
from snownlp import SnowNLP
hans = '音乐123'
hans_seg = SnowNLP(hans).words
assert lazy_pinyin(hans_seg, style=TONE2) == [u'yi1n', u'yue4', u'123']

Expand Down Expand Up @@ -170,16 +158,16 @@ def test_errors():


def test_errors_callable():
def foobar(char):
return 'a'
def foobar(chars):
return 'a' * len(chars)

class Foobar(object):
def __call__(self, char):
return 'a'
def __call__(self, chars):
return 'a' * len(chars)

n = 5
assert lazy_pinyin('あ' * n, errors=foobar) == ['a'] * n
assert lazy_pinyin('あ' * n, errors=Foobar()) == ['a'] * n
assert lazy_pinyin('あ' * n, errors=foobar) == ['a' * n]
assert lazy_pinyin('あ' * n, errors=Foobar()) == ['a' * n]


def test_update():
Expand Down Expand Up @@ -221,4 +209,4 @@ def test_simple_seg():
hans = '你好にほんごРусский язык'
ret = 'ni3 ha3o'
errors = lambda x: None
assert slug(hans, style=TONE2, separator=' ', errors=errors)
assert slug(hans, style=TONE2, separator=' ', errors=errors) == ret

0 comments on commit 3d52fe8

Please sign in to comment.