From c837bfc46fe6208ea4b9f7f000f5bba19fa6f34a Mon Sep 17 00:00:00 2001 From: Dingyuan Wang Date: Fri, 23 Sep 2016 15:27:02 +0800 Subject: [PATCH] add TONE3 and BOPOMOFO styles --- docs/api.rst | 8 ++++-- pypinyin/__init__.py | 16 +++++++++-- pypinyin/constants.py | 57 +++++++++++++++++++++++++++++++++---- pypinyin/core.py | 26 +++++++++++++---- pypinyin/phonetic_symbol.py | 3 +- tests/test_pinyin.py | 37 +++++++++++++++++++++--- 6 files changed, 126 insertions(+), 21 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 062066a5..bd50f884 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -9,12 +9,16 @@ API ======================= ==== ========================================================================================= pypinyin.NORMAL 0 普通风格,不带声调。如: 中国 -> ``zhong guo`` pypinyin.TONE 1 声调风格1,拼音声调在韵母第一个字母上(默认风格)。如: 中国 -> ``zhōng guó`` -pypinyin.TONE2 2 声调风格2,即拼音声调在各个拼音之后,用数字 [0-4] 进行表示。如: 中国 -> ``zho1ng guo2`` +pypinyin.TONE2 2 声调风格2,即拼音声调在各个韵母之后,用数字 [1-4] 进行表示。如: 中国 -> ``zho1ng guo2`` +pypinyin.TONE3 8 声调风格3,即拼音声调在各个拼音之后,用数字 [1-4] 进行表示。如: 中国 -> ``zhong1 guo2`` pypinyin.INITIALS 3 声母风格,只返回各个拼音的声母部分。如: 中国 -> ``zh g`` pypinyin.FIRST_LETTER 4 首字母风格,只返回拼音的首字母部分。如: 中国 -> ``z g`` pypinyin.FINALS 5 韵母风格1,只返回各个拼音的韵母部分,不带声调。如: 中国 -> ``ong uo`` pypinyin.FINALS_TONE 6 韵母风格2,带声调,声调在韵母第一个字母上。如:中国 -> ``ōng uó`` -pypinyin.FINALS_TONE2 7 韵母风格2,带声调,声调在各个拼音之后,用数字 [0-4] 进行表示。如: 中国 -> ``o1ng uo2`` +pypinyin.FINALS_TONE2 7 韵母风格2,带声调,声调在各个韵母之后,用数字 [1-4] 进行表示。如: 中国 -> ``o1ng uo2`` +pypinyin.FINALS_TONE3 9 韵母风格3,带声调,声调在各个拼音之后,用数字 [1-4] 进行表示。如: 中国 -> ``ong1 uo2`` +pypinyin.BOPOMOFO 10 注音风格,带声调,阴平(第一声)不标。如: 中国 -> ``ㄓㄨㄥ ㄍㄨㄛˊ`` +pypinyin.BOPOMOFO_FIRST 11 注音风格,仅首字母。如: 中国 -> ``ㄓ ㄍ`` ======================= ==== ========================================================================================= diff --git a/pypinyin/__init__.py b/pypinyin/__init__.py index cb1ef694..4c4e7ef1 100644 --- a/pypinyin/__init__.py +++ b/pypinyin/__init__.py @@ -7,8 +7,10 @@ from .compat import PY2 from 
.constants import ( - STYLE_NORMAL, STYLE_TONE, STYLE_TONE2, STYLE_INITIALS, STYLE_FIRST_LETTER, - STYLE_FINALS, STYLE_FINALS_TONE, STYLE_FINALS_TONE2 + STYLE_NORMAL, STYLE_TONE, STYLE_TONE2, STYLE_TONE3, + STYLE_INITIALS, STYLE_FIRST_LETTER, + STYLE_FINALS, STYLE_FINALS_TONE, STYLE_FINALS_TONE2, STYLE_FINALS_TONE3, + STYLE_BOPOMOFO, STYLE_BOPOMOFO_FIRST ) from .core import ( # noqa pinyin, lazy_pinyin, slug, load_single_dict, load_phrases_dict @@ -17,11 +19,15 @@ NORMAL = STYLE_NORMAL TONE = STYLE_TONE TONE2 = STYLE_TONE2 +TONE3 = STYLE_TONE3 INITIALS = STYLE_INITIALS FIRST_LETTER = STYLE_FIRST_LETTER FINALS = STYLE_FINALS FINALS_TONE = STYLE_FINALS_TONE FINALS_TONE2 = STYLE_FINALS_TONE2 +FINALS_TONE3 = STYLE_FINALS_TONE3 +BOPOMOFO = STYLE_BOPOMOFO +BOPOMOFO_FIRST = STYLE_BOPOMOFO_FIRST __title__ = 'pypinyin' __version__ = '0.13.0' @@ -33,11 +39,15 @@ 'STYLE_NORMAL', 'NORMAL', 'STYLE_TONE', 'TONE', 'STYLE_TONE2', 'TONE2', + 'STYLE_TONE3', 'TONE3', 'STYLE_INITIALS', 'INITIALS', 'STYLE_FINALS', 'FINALS', 'STYLE_FINALS_TONE', 'FINALS_TONE', 'STYLE_FINALS_TONE2', 'FINALS_TONE2', - 'STYLE_FIRST_LETTER', 'FIRST_LETTER' + 'STYLE_FINALS_TONE3', 'FINALS_TONE3', + 'STYLE_FIRST_LETTER', 'FIRST_LETTER', + 'STYLE_BOPOMOFO', 'BOPOMOFO', + 'STYLE_BOPOMOFO_FIRST', 'BOPOMOFO_FIRST' ] if PY2: # fix "TypeError: Item in ``from list'' not a string" on Python 2 diff --git a/pypinyin/constants.py b/pypinyin/constants.py index b918e3b7..a47564f8 100644 --- a/pypinyin/constants.py +++ b/pypinyin/constants.py @@ -27,7 +27,9 @@ # 匹配带声调字符的正则表达式 RE_PHONETIC_SYMBOL = r'[' + re.escape(re_phonetic_symbol_source) + r']' # 匹配使用数字标识声调的字符的正则表达式 -RE_TONE2 = r'([aeoiuvnm])([0-4])$' +RE_TONE2 = r'([aeoiuvnm])([1-4])$' +# 匹配 TONE2 中标识韵母声调的正则表达式 +RE_TONE3 = re.compile('^([a-z]+)([1-4])([a-z]*)$') # 有拼音的汉字 if SUPPORT_UCS4: RE_HANS = re.compile( @@ -54,19 +56,25 @@ PINYIN_STYLE = { 'NORMAL': 0, # 普通风格,不带声调 'TONE': 1, # 标准风格,声调在韵母的第一个字母上 - 'TONE2': 2, # 声调在拼音之后,使用数字 1~4 标识 + 'TONE2': 2, # 声调在韵母之后,使用数字 1~4 
标识 + 'TONE3': 8, # 声调在拼音之后,使用数字 1~4 标识 'INITIALS': 3, # 仅保留声母部分 'FIRST_LETTER': 4, # 仅保留首字母 'FINALS': 5, # 仅保留韵母部分,不带声调 'FINALS_TONE': 6, # 仅保留韵母部分,带声调 - 'FINALS_TONE2': 7, # 仅保留韵母部分,声调在拼音之后,使用数字 1~4 标识 + 'FINALS_TONE2': 7, # 仅保留韵母部分,声调在韵母之后,使用数字 1~4 标识 + 'FINALS_TONE3': 9, # 仅保留韵母部分,声调在拼音之后,使用数字 1~4 标识 + 'BOPOMOFO': 10, # 注音符号,带声调,阴平(第一声)不标 + 'BOPOMOFO_FIRST': 11, # 注音符号首字母 } # 普通风格,不带声调 NORMAL = STYLE_NORMAL = PINYIN_STYLE['NORMAL'] # 标准风格,声调在韵母的第一个字母上 TONE = STYLE_TONE = PINYIN_STYLE['TONE'] -# 声调在拼音之后,使用数字 1~4 标识 +# 声调在韵母之后,使用数字 1~4 标识 TONE2 = STYLE_TONE2 = PINYIN_STYLE['TONE2'] +# 声调在拼音之后,使用数字 1~4 标识 +TONE3 = STYLE_TONE3 = PINYIN_STYLE['TONE3'] # 仅保留声母部分 INITIALS = STYLE_INITIALS = PINYIN_STYLE['INITIALS'] # 仅保留首字母 @@ -75,8 +83,14 @@ FINALS = STYLE_FINALS = PINYIN_STYLE['FINALS'] # 仅保留韵母部分,带声调 FINALS_TONE = STYLE_FINALS_TONE = PINYIN_STYLE['FINALS_TONE'] -# 仅保留韵母部分,声调在拼音之后,使用数字 1~4 标识 +# 仅保留韵母部分,声调在韵母之后,使用数字 1~4 标识 FINALS_TONE2 = STYLE_FINALS_TONE2 = PINYIN_STYLE['FINALS_TONE2'] +# 仅保留韵母部分,声调在拼音之后,使用数字 1~4 标识 +FINALS_TONE3 = STYLE_FINALS_TONE3 = PINYIN_STYLE['FINALS_TONE3'] +# 注音符号,带声调,阴平(第一声)不标 +BOPOMOFO = STYLE_BOPOMOFO = PINYIN_STYLE['BOPOMOFO'] +# 注音符号首字母 +BOPOMOFO_FIRST = STYLE_BOPOMOFO_FIRST = PINYIN_STYLE['BOPOMOFO_FIRST'] U_FINALS_EXCEPTIONS_MAP = { u'ū': u'ǖ', @@ -84,3 +98,36 @@ u'ǔ': u'ǚ', u'ù': u'ǜ', } + +BOPOMOFO_REPLACE = ( + (re.compile('^m(\d)$'), 'mu\\1'), # 呣 + (re.compile('^n(\d)$'), 'N\\1'), # 嗯 + (re.compile('^r5$'), 'er5'), # 〜兒 + (re.compile('iu'), 'iou'), + (re.compile('ui'), 'uei'), + (re.compile('ong'), 'ung'), + (re.compile('^yi?'), 'i'), + (re.compile('^wu?'), 'u'), + (re.compile('iu'), 'v'), + (re.compile('^([jqx])u'), '\\1v'), + (re.compile('([iuv])n'), '\\1en'), + (re.compile('^zhi?'), 'Z'), + (re.compile('^chi?'), 'C'), + (re.compile('^shi?'), 'S'), + (re.compile('^([zcsr])i'), '\\1'), + (re.compile('ai'), 'A'), + (re.compile('ei'), 'I'), + (re.compile('ao'), 'O'), + (re.compile('ou'), 'U'), + (re.compile('ang'), 'K'), + 
(re.compile('eng'), 'G'), + (re.compile('an'), 'M'), + (re.compile('en'), 'N'), + (re.compile('er'), 'R'), + (re.compile('eh'), 'E'), + (re.compile('([iv])e'), '\\1E'), + (re.compile('([^0-4])$'), '\\g<1>0'), + (re.compile('1$'), ''), +) +BOPOMOFO_TABLE = dict(zip('bpmfdtnlgkhjqxZCSrzcsiuvaoeEAIOUMNKGR2340', + 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄧㄨㄩㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦˊˇˋ˙')) diff --git a/pypinyin/core.py b/pypinyin/core.py index dd4c45a4..aa4c9882 100644 --- a/pypinyin/core.py +++ b/pypinyin/core.py @@ -12,9 +12,11 @@ from .compat import text_type, callable_check from .constants import ( PHRASES_DICT, PINYIN_DICT, _INITIALS, PHONETIC_SYMBOL, RE_PHONETIC_SYMBOL, - RE_TONE2, RE_HANS, U_FINALS_EXCEPTIONS_MAP, - NORMAL, TONE, TONE2, INITIALS, FIRST_LETTER, - FINALS, FINALS_TONE, FINALS_TONE2 + RE_TONE2, RE_TONE3, RE_HANS, U_FINALS_EXCEPTIONS_MAP, + BOPOMOFO_REPLACE, BOPOMOFO_TABLE, + NORMAL, TONE, TONE2, TONE3, INITIALS, FIRST_LETTER, + FINALS, FINALS_TONE, FINALS_TONE2, FINALS_TONE3, + BOPOMOFO, BOPOMOFO_FIRST ) from .utils import simple_seg, _replace_tone2_style_dict_to_default @@ -152,7 +154,8 @@ def _replace(m): else: return re.sub(RE_TONE2, r'\1', PHONETIC_SYMBOL[symbol]) # 使用数字标识声调 - elif style in [TONE2, FINALS_TONE2]: + elif style in [TONE2, TONE3, FINALS_TONE2, FINALS_TONE3, + BOPOMOFO, BOPOMOFO_FIRST]: # 返回使用数字标识声调的字符 return PHONETIC_SYMBOL[symbol] # 声调在头上 @@ -161,17 +164,28 @@ def _replace(m): # 替换拼音中的带声调字符 py = re.sub(RE_PHONETIC_SYMBOL, _replace, pinyin) + # 将声调移动到最后 + if style in [TONE3, FINALS_TONE3, BOPOMOFO, BOPOMOFO_FIRST]: + py = RE_TONE3.sub(r'\1\3\2', py) # 首字母 if style == FIRST_LETTER: py = py[0] # 韵母 - elif style in [FINALS, FINALS_TONE, FINALS_TONE2]: - # 不处理鼻音: 'ḿ', 'ń', 'ň', 'ǹ ' + elif style in [FINALS, FINALS_TONE, FINALS_TONE2, FINALS_TONE3]: + # 不处理鼻音: 'ḿ', 'ń', 'ň', 'ǹ' if pinyin and pinyin[0] not in [ '\u1e3f', '\u0144', '\u0148', '\u01f9' ]: py = final(py) + # 声调在拼音之后、注音 + elif style in [BOPOMOFO, BOPOMOFO_FIRST]: + # 查表替换成注音 + for f, r in 
BOPOMOFO_REPLACE: + py = f.sub(r, py) + py = ''.join(BOPOMOFO_TABLE.get(x, x) for x in py) + if style == BOPOMOFO_FIRST: + py = py[0] return py diff --git a/pypinyin/phonetic_symbol.py b/pypinyin/phonetic_symbol.py index 6e008a42..318d613e 100644 --- a/pypinyin/phonetic_symbol.py +++ b/pypinyin/phonetic_symbol.py @@ -25,7 +25,8 @@ "ú": "u2", "ǔ": "u3", "ù": "u4", - "ü": "v0", + # üe + "ü": "v", "ǘ": "v2", "ǚ": "v3", "ǜ": "v4", diff --git a/tests/test_pinyin.py b/tests/test_pinyin.py index 4fdd0fc6..b71fe4c5 100644 --- a/tests/test_pinyin.py +++ b/tests/test_pinyin.py @@ -7,8 +7,9 @@ from pypinyin import ( pinyin, slug, lazy_pinyin, load_single_dict, - load_phrases_dict, NORMAL, TONE, TONE2, INITIALS, - FIRST_LETTER, FINALS, FINALS_TONE, FINALS_TONE2 + load_phrases_dict, NORMAL, TONE, TONE2, TONE3, INITIALS, + FIRST_LETTER, FINALS, FINALS_TONE, FINALS_TONE2, FINALS_TONE3, + BOPOMOFO, BOPOMOFO_FIRST ) from pypinyin.compat import SUPPORT_UCS4 from .utils import has_module @@ -23,12 +24,18 @@ def test_pinyin_initials(): assert pinyin(hans, NORMAL) == [['zhong'], ['xin']] # 声调风格,拼音声调在韵母第一个字母上 assert pinyin(hans, TONE) == [['zh\u014dng'], ['x\u012bn']] - # 声调风格2,即拼音声调在各个拼音之后,用数字 [0-4] 进行表示 + # 声调风格2,即拼音声调在各个声母之后,用数字 [1-4] 进行表示 assert pinyin(hans, TONE2) == [['zho1ng'], ['xi1n']] + # 声调风格3,即拼音声调在各个拼音之后,用数字 [1-4] 进行表示 + assert pinyin(hans, TONE3) == [['zhong1'], ['xin1']] # 声母风格,只返回各个拼音的声母部分 assert pinyin(hans, INITIALS) == [['zh'], ['x']] # 首字母风格,只返回拼音的首字母部分 assert pinyin(hans, FIRST_LETTER) == [['z'], ['x']] + # 注音风格,带声调 + assert pinyin(hans, BOPOMOFO) == [['ㄓㄨㄥ'], ['ㄒㄧㄣ']] + # 注音风格,首字母 + assert pinyin(hans, BOPOMOFO_FIRST) == [['ㄓ'], ['ㄒ']] # 启用多音字模式 assert pinyin(hans, heteronym=True) == [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']] @@ -36,8 +43,10 @@ def test_pinyin_initials(): assert pinyin(hans, style=FINALS) == [['ong'], ['in']] # 韵母风格2,带声调,声调在韵母第一个字母上 assert pinyin(hans, style=FINALS_TONE) == [['\u014dng'], ['\u012bn']] - # 韵母风格2,带声调,声调在各个拼音之后,用数字 [0-4] 进行表示 + # 
韵母风格2,带声调,声调在各个声母之后,用数字 [1-4] 进行表示 assert pinyin(hans, style=FINALS_TONE2) == [['o1ng'], ['i1n']] + # 韵母风格3,带声调,声调在各个拼音之后,用数字 [1-4] 进行表示 + assert pinyin(hans, style=FINALS_TONE3) == [['ong1'], ['in1']] def test_pinyin_finals(): @@ -48,14 +57,18 @@ def test_pinyin_finals(): assert pinyin(hans, NORMAL) == [['ao'], ['ao']] assert pinyin(hans, TONE) == [['\xe1o'], ['\xe1o']] assert pinyin(hans, TONE2) == [['a2o'], ['a2o']] + assert pinyin(hans, TONE3) == [['ao2'], ['ao2']] assert pinyin(hans, INITIALS) == [[''], ['']] assert pinyin(hans, FIRST_LETTER) == [['a'], ['a']] + assert pinyin(hans, BOPOMOFO) == [['ㄠˊ'], ['ㄠˊ']] + assert pinyin(hans, BOPOMOFO_FIRST) == [['ㄠ'], ['ㄠ']] assert pinyin(hans, heteronym=True) == [['\xe1o'], ['\xe1o']] assert pinyin('啊', heteronym=True) == \ [['a', 'è', 'ā', 'á', 'ǎ', 'à']] assert pinyin(hans, style=FINALS) == [['ao'], ['ao']] assert pinyin(hans, style=FINALS_TONE) == [['\xe1o'], ['\xe1o']] assert pinyin(hans, style=FINALS_TONE2) == [['a2o'], ['a2o']] + assert pinyin(hans, style=FINALS_TONE3) == [['ao2'], ['ao2']] def test_slug(): @@ -99,6 +112,7 @@ def test_lazy_pinyin(): assert lazy_pinyin('中心') == ['zhong', 'xin'] assert lazy_pinyin('中心', style=TONE) == ['zh\u014dng', 'x\u012bn'] assert lazy_pinyin('中心', style=INITIALS) == ['zh', 'x'] + assert lazy_pinyin('中心', style=BOPOMOFO) == ['ㄓㄨㄥ', 'ㄒㄧㄣ'] @pytest.mark.skipif(not has_module('jieba'), reason='cant import jieba') @@ -241,31 +255,41 @@ def test_simple_seg(): # 误把 yu 放到声母列表了 ['鱼', {'style': TONE2}, ['yu2']], ['鱼', {'style': FINALS}, ['v']], + ['鱼', {'style': BOPOMOFO}, ['ㄩˊ']], ['雨', {'style': TONE2}, ['yu3']], ['雨', {'style': FINALS}, ['v']], + ['雨', {'style': BOPOMOFO}, ['ㄩˇ']], ['元', {'style': TONE2}, ['yua2n']], ['元', {'style': FINALS}, ['van']], + ['元', {'style': BOPOMOFO}, ['ㄩㄢˊ']], # y, w 也不是拼音, yu的韵母是v, yi的韵母是i, wu的韵母是u ['呀', {'style': INITIALS}, ['']], ['呀', {'style': TONE2}, ['ya']], ['呀', {'style': FINALS}, ['ia']], + ['呀', {'style': BOPOMOFO}, ['ㄧㄚ˙']], ['无', {'style': 
INITIALS}, ['']], ['无', {'style': TONE2}, ['wu2']], ['无', {'style': FINALS}, ['u']], + ['无', {'style': BOPOMOFO}, ['ㄨˊ']], ['衣', {'style': TONE2}, ['yi1']], ['衣', {'style': FINALS}, ['i']], + ['衣', {'style': BOPOMOFO}, ['ㄧ']], ['万', {'style': TONE2}, ['wa4n']], ['万', {'style': FINALS}, ['uan']], + ['万', {'style': BOPOMOFO}, ['ㄨㄢˋ']], # ju, qu, xu 的韵母应该是 v ['具', {'style': FINALS_TONE}, ['ǜ']], ['具', {'style': FINALS_TONE2}, ['v4']], ['具', {'style': FINALS}, ['v']], + ['具', {'style': BOPOMOFO}, ['ㄐㄩˋ']], ['取', {'style': FINALS_TONE}, ['ǚ']], ['取', {'style': FINALS_TONE2}, ['v3']], ['取', {'style': FINALS}, ['v']], + ['取', {'style': BOPOMOFO}, ['ㄑㄩˇ']], ['徐', {'style': FINALS_TONE}, ['ǘ']], ['徐', {'style': FINALS_TONE2}, ['v2']], ['徐', {'style': FINALS}, ['v']], + ['徐', {'style': BOPOMOFO}, ['ㄒㄩˊ']], # ń ['嗯', {'style': NORMAL}, ['n']], ['嗯', {'style': TONE}, ['ń']], @@ -275,6 +299,7 @@ def test_simple_seg(): ['嗯', {'style': FINALS}, ['n']], ['嗯', {'style': FINALS_TONE}, ['ń']], ['嗯', {'style': FINALS_TONE2}, ['n2']], + ['嗯', {'style': BOPOMOFO}, ['ㄣˊ']], # ḿ \u1e3f U+1E3F ['呣', {'style': NORMAL}, ['m']], ['呣', {'style': TONE}, ['ḿ']], @@ -284,8 +309,12 @@ def test_simple_seg(): ['呣', {'style': FINALS}, ['m']], ['呣', {'style': FINALS_TONE}, ['ḿ']], ['呣', {'style': FINALS_TONE2}, ['m2']], + ['呣', {'style': BOPOMOFO}, ['ㄇㄨˊ']], # 41 ['彷徨', {}, ['pang', 'huang']], + # 注音 + ['打量', {'style': BOPOMOFO}, ['ㄉㄚˇ', 'ㄌㄧㄤ˙']], + ['黄山b股', {'style': BOPOMOFO}, ['ㄏㄨㄤˊ', 'ㄕㄢ', 'b', 'ㄍㄨˇ']], ]