mozillazg · mozillazg · Jan 2, 2016 · Jan 2, 2016 · Jan 2, 2016 · Jan 2, 2016
diff --git a/.gitignore b/.gitignore
@@ -42,3 +42,4 @@ tools/words.txt
 tools/phrases_dict.txt
 venv
 .cache/
+2.7/
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,12 @@
 Changelog
 ---------
 
+0.10.0 (2016-mm-dd)
+++++++++++++++++++++
+
+* **[New]** Python 3.3 及以上版本默认支持 `U+20000 ~ U+2FA1F` 区间内的汉字(详见 `#33`_)
+
+
 0.9.5 (2015-12-19)
 ++++++++++++++++++++
 
@@ -270,3 +276,4 @@ __ https://github.com/mozillazg/python-pinyin/issues/8
 .. _汉语拼音 - 维基百科: https://zh.wikipedia.org/wiki/%E6%B1%89%E8%AF%AD%E6%8B%BC%E9%9F%B3#cite_ref-10
 .. _@xulin97: https://github.com/xulin97
 .. _#31: https://github.com/mozillazg/python-pinyin/issues/31
+.. _#33: https://github.com/mozillazg/python-pinyin/pull/33
diff --git a/pypinyin/__init__.py b/pypinyin/__init__.py
@@ -12,6 +12,7 @@
 import sys
 
 from . import phonetic_symbol, pinyin_dict
+from .compat import SUPPORT_UCS4
 
 __title__ = 'pypinyin'
 __version__ = '0.9.5'
@@ -58,9 +59,26 @@
 # 匹配使用数字标识声调的字符的正则表达式
 RE_TONE2 = r'([aeoiuvnm])([0-4])$'
 # 有拼音的汉字
-RE_HANS = re.compile(r'^(?:[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff])+$')
-# 没有拼音的字符
-RE_NONE_HANS = re.compile(r'^(?:[^\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff])+$')
+if SUPPORT_UCS4:  # also accept the ranges above U+FFFF
+    RE_HANS = re.compile(
+        r'^(?:['
+        r'\u3400-\u4dbf'           # CJK Extension A: [3400-4DBF]
+        r'\u4e00-\u9fff'           # CJK Unified (basic): [4E00-9FFF]
+        r'\uf900-\ufaff'           # CJK Compatibility: [F900-FAFF]
+        r'\U00020000-\U0002A6DF'   # CJK Extension B: [20000-2A6DF]
+        r'\U0002A703-\U0002B73F'   # CJK Extension C: [2A700-2B73F], from 2A703
+        r'\U0002B740-\U0002B81D'   # CJK Extension D: [2B740-2B81D]
+        r'\U0002F80A-\U0002FA1F'   # CJK Compat Supp: [2F800-2FA1F], from 2F80A
+        r'])+$'
+    )
+else:  # only the ranges at or below U+FFFF
+    RE_HANS = re.compile(
+        r'^(?:['
+        r'\u3400-\u4dbf'           # CJK Extension A: [3400-4DBF]
+        r'\u4e00-\u9fff'           # CJK Unified (basic): [4E00-9FFF]
+        r'\uf900-\ufaff'           # CJK Compatibility: [F900-FAFF]
+        r'])+$'
+    )
 
 # 拼音风格
 PINYIN_STYLE = {
@@ -154,7 +172,7 @@ def seg(hans):
         hans = simple_seg(hans)
         ret = []
         for x in hans:
-            if RE_NONE_HANS.match(x):   # 没有拼音的字符，不再参与二次分词
+            if not RE_HANS.match(x):   # 没有拼音的字符，不再参与二次分词
                 ret.append(x)
             else:
                 ret.extend(list(seg.jieba.cut(x)))

diff --git a/pypinyin/compat.py b/pypinyin/compat.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+# True when one code point above U+FFFF is a single character (len 1).
+SUPPORT_UCS4 = len('\U00020000') == 1
diff --git a/pypinyin/pinyin_dict.py b/pypinyin/pinyin_dict.py
@@ -28524,10 +28524,7 @@
     # 0xFAFD: '',  #
     # 0xFAFE: '',  #
     # 0xFAFF: '',  #
-}
 
-# Python 中高位(FFFF 以上) Unicode 字符的长度不再是1而是2，所以暂不处理。
-'''
     # CJK 扩展 B:[20000-2A6DF]
     0x20000: 'hē',  # 𠀀
     0x20001: 'qī',  # 𠀁
@@ -76181,4 +76178,4 @@
     # 0x2FA1D: '',  # 𪘀
     # 0x2FA1E: '',  #
     # 0x2FA1F: '',  #
-'''
+}
diff --git a/tests/test_pinyin.py b/tests/test_pinyin.py
@@ -10,6 +10,7 @@
     load_phrases_dict, NORMAL, TONE, TONE2, INITIALS,
     FIRST_LETTER, FINALS, FINALS_TONE, FINALS_TONE2
 )
+from pypinyin.compat import SUPPORT_UCS4
 from .utils import has_module
 
 
@@ -285,6 +286,32 @@ def test_simple_seg():
 ]
 
 
-@pytest.mark.parametrize('hans, kwargs,result', data_for_update)
+@pytest.mark.parametrize('hans, kwargs, result', data_for_update)
 def test_update(hans, kwargs, result):
     assert lazy_pinyin(hans, **kwargs) == result
+
+
+@pytest.mark.skipif(not SUPPORT_UCS4, reason='dont support ucs4')
+@pytest.mark.parametrize(
+    'han, result', [
+        ['\U00020000', ['he']],      # CJK Extension B: [20000-2A6DF]
+        ['\U0002A703', ['ga']],      # CJK Extension C: [2A700-2B73F]
+        ['\U0002B740', ['wu']],      # CJK Extension D: [2B740-2B81D]
+        ['\U0002F80A', ['seng']],    # CJK Compat Supplement: [2F800-2FA1F]
+    ]
+)
+def test_support_ucs4(han, result):
+    assert lazy_pinyin(han) == result
+
+
+@pytest.mark.skipif(SUPPORT_UCS4, reason='support ucs4')
+@pytest.mark.parametrize(
+    'han', [
+        '\U00020000',      # CJK Extension B: [20000-2A6DF]
+        '\U0002A703',      # CJK Extension C: [2A700-2B73F]
+        '\U0002B740',      # CJK Extension D: [2B740-2B81D]
+        '\U0002F80A',      # CJK Compat Supplement: [2F800-2FA1F]
+    ]
+)
+def test_dont_support_ucs4(han):
+    assert pinyin(han) == [[han]]