Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

支持 U+20000 ~ U+2FA1F 区间内的汉字 #33

Merged
merged 3 commits into from Jan 2, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -42,3 +42,4 @@ tools/words.txt
tools/phrases_dict.txt
venv
.cache/
2.7/
7 changes: 7 additions & 0 deletions CHANGELOG.rst
@@ -1,6 +1,12 @@
Changelog
---------

0.10.0 (2016-mm-dd)
++++++++++++++++++++

* **[New]** Python 3.3 及以上版本默认支持 `U+20000 ~ U+2FA1F` 区间内的汉字(详见 `#33`_)


0.9.5 (2015-12-19)
++++++++++++++++++++

Expand Down Expand Up @@ -270,3 +276,4 @@ __ https://github.com/mozillazg/python-pinyin/issues/8
.. _汉语拼音 - 维基百科: https://zh.wikipedia.org/wiki/%E6%B1%89%E8%AF%AD%E6%8B%BC%E9%9F%B3#cite_ref-10
.. _@xulin97: https://github.com/xulin97
.. _#31: https://github.com/mozillazg/python-pinyin/issues/31
.. _#33: https://github.com/mozillazg/python-pinyin/pull/33
26 changes: 22 additions & 4 deletions pypinyin/__init__.py
Expand Up @@ -12,6 +12,7 @@
import sys

from . import phonetic_symbol, pinyin_dict
from .compat import SUPPORT_UCS4

__title__ = 'pypinyin'
__version__ = '0.9.5'
Expand Down Expand Up @@ -58,9 +59,26 @@
# Regular expression matching a character whose tone is marked with a digit
RE_TONE2 = r'([aeoiuvnm])([0-4])$'
# Han characters that have pinyin (three BMP ranges: U+3400-4DBF,
# U+4E00-9FFF, U+F900-FAFF).
# NOTE(review): RE_HANS is rebound by the `if SUPPORT_UCS4:` statement that
# follows, so this assignment looks like the pre-change side of the diff.
RE_HANS = re.compile(r'^(?:[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff])+$')
# Characters that have no pinyin (anything outside the three ranges above)
RE_NONE_HANS = re.compile(r'^(?:[^\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff])+$')
# Han characters that have pinyin.  The three BMP ranges are always part of
# the character class; the supplementary-plane ranges are appended only when
# SUPPORT_UCS4 is true, so the compiled pattern is the same as the one the
# two-branch form would produce.
# NOTE(review): the Extension C and Compatibility Supplement ranges start at
# 2A703 and 2F80A, not at the block starts quoted in brackets -- confirm this
# is intentional.
RE_HANS = re.compile(
    r'^(?:['
    r'\u3400-\u4dbf'  # CJK Extension A: [3400-4DBF]
    r'\u4e00-\u9fff'  # CJK basic: [4E00-9FFF]
    r'\uf900-\ufaff'  # CJK Compatibility: [F900-FAFF]
    + (
        r'\U00020000-\U0002A6DF'  # CJK Extension B: [20000-2A6DF]
        r'\U0002A703-\U0002B73F'  # CJK Extension C: [2A700-2B73F]
        r'\U0002B740-\U0002B81D'  # CJK Extension D: [2B740-2B81D]
        r'\U0002F80A-\U0002FA1F'  # CJK Compat. Supplement: [2F800-2FA1F]
        if SUPPORT_UCS4 else r''
    )
    + r'])+$'
)

# 拼音风格
PINYIN_STYLE = {
Expand Down Expand Up @@ -154,7 +172,7 @@ def seg(hans):
hans = simple_seg(hans)
ret = []
for x in hans:
if RE_NONE_HANS.match(x): # 没有拼音的字符,不再参与二次分词
if not RE_HANS.match(x): # 没有拼音的字符,不再参与二次分词
ret.append(x)
else:
ret.extend(list(seg.jieba.cut(x)))
Expand Down
7 changes: 7 additions & 0 deletions pypinyin/compat.py
@@ -0,0 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals


# True when this interpreter represents U+20000 (a code point above U+FFFF)
# as a single character; if the string's length is not 1 the flag is False.
# The literal is a text string because of the `unicode_literals` import above.
SUPPORT_UCS4 = len('\U00020000') == 1
5 changes: 1 addition & 4 deletions pypinyin/pinyin_dict.py
Expand Up @@ -28524,10 +28524,7 @@
# 0xFAFD: '', #
# 0xFAFE: '', #
# 0xFAFF: '', #
}

# Python 中高位(FFFF 以上) Unicode 字符的长度不再是1而是2,所以暂不处理。
'''
# CJK 扩展 B:[20000-2A6DF]
0x20000: 'hē', # 𠀀
0x20001: 'qī', # 𠀁
Expand Down Expand Up @@ -76181,4 +76178,4 @@
# 0x2FA1D: '', # 𪘀
# 0x2FA1E: '', #
# 0x2FA1F: '', #
'''
}
29 changes: 28 additions & 1 deletion tests/test_pinyin.py
Expand Up @@ -10,6 +10,7 @@
load_phrases_dict, NORMAL, TONE, TONE2, INITIALS,
FIRST_LETTER, FINALS, FINALS_TONE, FINALS_TONE2
)
from pypinyin.compat import SUPPORT_UCS4
from .utils import has_module


Expand Down Expand Up @@ -285,6 +286,32 @@ def test_simple_seg():
]


@pytest.mark.parametrize('hans, kwargs, result', data_for_update)
def test_update(hans, kwargs, result):
    """Check ``lazy_pinyin(hans, **kwargs)`` against the expected ``result``.

    Each case is one ``(hans, kwargs, result)`` row of ``data_for_update``.
    Only one ``parametrize`` decorator is applied: stacking a second one over
    the same argument names makes pytest reject the test at collection time.
    """
    assert lazy_pinyin(hans, **kwargs) == result


@pytest.mark.skipif(not SUPPORT_UCS4, reason='dont support ucs4')
@pytest.mark.parametrize('han, result', [
    ('\U00020000', ['he']),    # CJK Extension B: [20000-2A6DF]
    ('\U0002A703', ['ga']),    # CJK Extension C: [2A700-2B73F]
    ('\U0002B740', ['wu']),    # CJK Extension D: [2B740-2B81D]
    ('\U0002F80A', ['seng']),  # CJK Compatibility Supplement: [2F800-2FA1F]
])
def test_support_ucs4(han, result):
    """Check ``lazy_pinyin`` on one character from each supplementary range.

    Skipped when ``SUPPORT_UCS4`` is false.
    """
    converted = lazy_pinyin(han)
    assert converted == result


@pytest.mark.skipif(SUPPORT_UCS4, reason='support ucs4')
@pytest.mark.parametrize('han', [
    '\U00020000',  # CJK Extension B: [20000-2A6DF]
    '\U0002A703',  # CJK Extension C: [2A700-2B73F]
    '\U0002B740',  # CJK Extension D: [2B740-2B81D]
    '\U0002F80A',  # CJK Compatibility Supplement: [2F800-2FA1F]
])
def test_dont_support_ucs4(han):
    """Check that ``pinyin`` returns the input unchanged, as ``[[han]]``.

    Runs only when ``SUPPORT_UCS4`` is false; skipped otherwise.
    """
    passthrough = [[han]]
    assert pinyin(han) == passthrough