Skip to content

Commit

Permalink
add TONE3 and BOPOMOFO styles
Browse files Browse the repository at this point in the history
  • Loading branch information
gumblex committed Sep 23, 2016
1 parent 93b653f commit c837bfc
Show file tree
Hide file tree
Showing 6 changed files with 126 additions and 21 deletions.
8 changes: 6 additions & 2 deletions docs/api.rst
Expand Up @@ -9,12 +9,16 @@ API
======================= ==== =========================================================================================
pypinyin.NORMAL 0 普通风格,不带声调。如: 中国 -> ``zhong guo``
pypinyin.TONE 1 声调风格1,拼音声调在韵母第一个字母上(默认风格)。如: 中国 -> ``zhōng guó``
pypinyin.TONE2 2 声调风格2,即拼音声调在各个拼音之后,用数字 [0-4] 进行表示。如: 中国 -> ``zho1ng guo2``
pypinyin.TONE2 2 声调风格2,即拼音声调在各个韵母之后,用数字 [1-4] 进行表示。如: 中国 -> ``zho1ng guo2``
pypinyin.TONE3 8 声调风格3,即拼音声调在各个拼音之后,用数字 [1-4] 进行表示。如: 中国 -> ``zhong1 guo2``
pypinyin.INITIALS 3 声母风格,只返回各个拼音的声母部分。如: 中国 -> ``zh g``
pypinyin.FIRST_LETTER 4 首字母风格,只返回拼音的首字母部分。如: 中国 -> ``z g``
pypinyin.FINALS 5 韵母风格1,只返回各个拼音的韵母部分,不带声调。如: 中国 -> ``ong uo``
pypinyin.FINALS_TONE 6 韵母风格2,带声调,声调在韵母第一个字母上。如:中国 -> ``ōng uó``
pypinyin.FINALS_TONE2 7 韵母风格2,带声调,声调在各个拼音之后,用数字 [0-4] 进行表示。如: 中国 -> ``o1ng uo2``
pypinyin.FINALS_TONE2 7 韵母风格2,带声调,声调在各个韵母之后,用数字 [1-4] 进行表示。如: 中国 -> ``o1ng uo2``
pypinyin.FINALS_TONE3 9 韵母风格3,带声调,声调在各个拼音之后,用数字 [1-4] 进行表示。如: 中国 -> ``ong1 uo2``
pypinyin.BOPOMOFO 10 注音风格,带声调,阴平(第一声)不标。如: 中国 -> ``ㄓㄨㄥ ㄍㄨㄛˊ``
pypinyin.BOPOMOFO_FIRST 11 注音风格,仅首字母。如: 中国 -> ``ㄓ ㄍ``
======================= ==== =========================================================================================


Expand Down
16 changes: 13 additions & 3 deletions pypinyin/__init__.py
Expand Up @@ -7,8 +7,10 @@

from .compat import PY2
from .constants import (
STYLE_NORMAL, STYLE_TONE, STYLE_TONE2, STYLE_INITIALS, STYLE_FIRST_LETTER,
STYLE_FINALS, STYLE_FINALS_TONE, STYLE_FINALS_TONE2
STYLE_NORMAL, STYLE_TONE, STYLE_TONE2, STYLE_TONE3,
STYLE_INITIALS, STYLE_FIRST_LETTER,
STYLE_FINALS, STYLE_FINALS_TONE, STYLE_FINALS_TONE2, STYLE_FINALS_TONE3,
STYLE_BOPOMOFO, STYLE_BOPOMOFO_FIRST
)
from .core import ( # noqa
pinyin, lazy_pinyin, slug, load_single_dict, load_phrases_dict
Expand All @@ -17,11 +19,15 @@
NORMAL = STYLE_NORMAL
TONE = STYLE_TONE
TONE2 = STYLE_TONE2
TONE3 = STYLE_TONE3
INITIALS = STYLE_INITIALS
FIRST_LETTER = STYLE_FIRST_LETTER
FINALS = STYLE_FINALS
FINALS_TONE = STYLE_FINALS_TONE
FINALS_TONE2 = STYLE_FINALS_TONE2
FINALS_TONE3 = STYLE_FINALS_TONE3
BOPOMOFO = STYLE_BOPOMOFO
BOPOMOFO_FIRST = STYLE_BOPOMOFO_FIRST

__title__ = 'pypinyin'
__version__ = '0.13.0'
Expand All @@ -33,11 +39,15 @@
'STYLE_NORMAL', 'NORMAL',
'STYLE_TONE', 'TONE',
'STYLE_TONE2', 'TONE2',
'STYLE_TONE3', 'TONE3',
'STYLE_INITIALS', 'INITIALS',
'STYLE_FINALS', 'FINALS',
'STYLE_FINALS_TONE', 'FINALS_TONE',
'STYLE_FINALS_TONE2', 'FINALS_TONE2',
'STYLE_FIRST_LETTER', 'FIRST_LETTER'
'STYLE_FINALS_TONE3', 'FINALS_TONE3',
'STYLE_FIRST_LETTER', 'FIRST_LETTER',
'STYLE_BOPOMOFO', 'BOPOMOFO',
'STYLE_BOPOMOFO_FIRST', 'BOPOMOFO_FIRST'
]
if PY2:
# fix "TypeError: Item in ``from list'' not a string" on Python 2
Expand Down
57 changes: 52 additions & 5 deletions pypinyin/constants.py
Expand Up @@ -27,7 +27,9 @@
# 匹配带声调字符的正则表达式
RE_PHONETIC_SYMBOL = r'[' + re.escape(re_phonetic_symbol_source) + r']'
# 匹配使用数字标识声调的字符的正则表达式
RE_TONE2 = r'([aeoiuvnm])([0-4])$'
RE_TONE2 = r'([aeoiuvnm])([1-4])$'
# 匹配 TONE2 中标识韵母声调的正则表达式
RE_TONE3 = re.compile('^([a-z]+)([1-4])([a-z]*)$')
# 有拼音的汉字
if SUPPORT_UCS4:
RE_HANS = re.compile(
Expand All @@ -54,19 +56,25 @@
PINYIN_STYLE = {
'NORMAL': 0, # 普通风格,不带声调
'TONE': 1, # 标准风格,声调在韵母的第一个字母上
'TONE2': 2, # 声调在拼音之后,使用数字 1~4 标识
'TONE2': 2, # 声调在韵母之后,使用数字 1~4 标识
'TONE3': 8, # 声调在拼音之后,使用数字 1~4 标识
'INITIALS': 3, # 仅保留声母部分
'FIRST_LETTER': 4, # 仅保留首字母
'FINALS': 5, # 仅保留韵母部分,不带声调
'FINALS_TONE': 6, # 仅保留韵母部分,带声调
'FINALS_TONE2': 7, # 仅保留韵母部分,声调在拼音之后,使用数字 1~4 标识
'FINALS_TONE2': 7, # 仅保留韵母部分,声调在韵母之后,使用数字 1~4 标识
'FINALS_TONE3': 9, # 仅保留韵母部分,声调在拼音之后,使用数字 1~4 标识
'BOPOMOFO': 10, # 注音符号,带声调,阴平(第一声)不标
'BOPOMOFO_FIRST': 11, # 注音符号首字母
}
# 普通风格,不带声调
NORMAL = STYLE_NORMAL = PINYIN_STYLE['NORMAL']
# 标准风格,声调在韵母的第一个字母上
TONE = STYLE_TONE = PINYIN_STYLE['TONE']
# 声调在拼音之后,使用数字 1~4 标识
# 声调在韵母之后,使用数字 1~4 标识
TONE2 = STYLE_TONE2 = PINYIN_STYLE['TONE2']
# 声调在拼音之后,使用数字 1~4 标识
TONE3 = STYLE_TONE3 = PINYIN_STYLE['TONE3']
# 仅保留声母部分
INITIALS = STYLE_INITIALS = PINYIN_STYLE['INITIALS']
# 仅保留首字母
Expand All @@ -75,12 +83,51 @@
FINALS = STYLE_FINALS = PINYIN_STYLE['FINALS']
# 仅保留韵母部分,带声调
FINALS_TONE = STYLE_FINALS_TONE = PINYIN_STYLE['FINALS_TONE']
# 仅保留韵母部分,声调在拼音之后,使用数字 1~4 标识
# 仅保留韵母部分,声调在韵母之后,使用数字 1~4 标识
FINALS_TONE2 = STYLE_FINALS_TONE2 = PINYIN_STYLE['FINALS_TONE2']
# 仅保留韵母部分,声调在拼音之后,使用数字 1~4 标识
FINALS_TONE3 = STYLE_FINALS_TONE3 = PINYIN_STYLE['FINALS_TONE3']
# 注音符号,带声调,阴平(第一声)不标
BOPOMOFO = STYLE_BOPOMOFO = PINYIN_STYLE['BOPOMOFO']
# 注音符号首字母
BOPOMOFO_FIRST = STYLE_BOPOMOFO_FIRST = PINYIN_STYLE['BOPOMOFO_FIRST']

# Maps each tone-marked "u" to the "ü" carrying the same tone mark
# (macron, acute, caron, grave -- i.e. tones 1 to 4).
# NOTE(review): presumably applied where a written "u" actually stands
# for "ü" (e.g. after j/q/x/y) -- confirm against the callers.
U_FINALS_EXCEPTIONS_MAP = dict([
    (u'ū', u'ǖ'),
    (u'ú', u'ǘ'),
    (u'ǔ', u'ǚ'),
    (u'ù', u'ǜ'),
])

# Ordered (compiled pattern, replacement) rewrite rules that turn a pinyin
# syllable with a trailing tone digit (e.g. 'zhong1') into an intermediate
# spelling in which every bopomofo symbol is exactly one character, so the
# result can be translated character by character through BOPOMOFO_TABLE.
#
# The rules are applied sequentially and the ORDER MATTERS: later rules rely
# on the output of earlier ones (see the second 'iu' rule below).
# All patterns/replacements are raw strings so that regex escapes such as
# \d and \1 never depend on Python's string-escape handling (a bare '\d' in
# a normal string literal is an invalid escape sequence and triggers a
# warning on modern Python).
BOPOMOFO_REPLACE = (
    # --- syllabic consonants and the bare "r" suffix -----------------------
    (re.compile(r'^m(\d)$'), r'mu\1'),  # syllabic m (as in 呣) -> "mu"
    (re.compile(r'^n(\d)$'), r'N\1'),  # syllabic n (as in 嗯) -> N ("en")
    (re.compile(r'^r5$'), r'er5'),  # bare "r5" (the 兒 suffix) -> "er5"
    # --- expand abbreviated finals to their full spelling ------------------
    (re.compile(r'iu'), r'iou'),
    (re.compile(r'ui'), r'uei'),
    (re.compile(r'ong'), r'ung'),
    # --- leading y / w are spelling conventions for the medials i / u ------
    (re.compile(r'^yi?'), r'i'),
    (re.compile(r'^wu?'), r'u'),
    # Every original "iu" was already expanded to "iou" above, so an "iu"
    # seen here was produced by the y-rule from "yu" and stands for ü ("v").
    (re.compile(r'iu'), r'v'),
    # "u" directly after a leading j/q/x is rewritten to ü ("v").
    (re.compile(r'^([jqx])u'), r'\1v'),
    # i/u/v directly followed by n gets an "e" inserted (in -> ien, ...),
    # so the "en"/"eng" rules below can pick up the nasal ending.
    (re.compile(r'([iuv])n'), r'\1en'),
    # --- retroflex initials, with their optional "i", become one letter ----
    (re.compile(r'^zhi?'), r'Z'),
    (re.compile(r'^chi?'), r'C'),
    (re.compile(r'^shi?'), r'S'),
    # z/c/s/r followed by "i": the "i" is dropped.
    (re.compile(r'^([zcsr])i'), r'\1'),
    # --- multi-letter finals become single uppercase placeholders ----------
    (re.compile(r'ai'), r'A'),
    (re.compile(r'ei'), r'I'),
    (re.compile(r'ao'), r'O'),
    (re.compile(r'ou'), r'U'),
    (re.compile(r'ang'), r'K'),
    (re.compile(r'eng'), r'G'),
    (re.compile(r'an'), r'M'),
    (re.compile(r'en'), r'N'),
    (re.compile(r'er'), r'R'),
    (re.compile(r'eh'), r'E'),
    # "e" right after i / v becomes E (the same placeholder as "eh").
    (re.compile(r'([iv])e'), r'\1E'),
    # --- tone handling -----------------------------------------------------
    # No trailing tone digit: append 0, which BOPOMOFO_TABLE maps to the
    # neutral-tone dot.
    (re.compile(r'([^0-4])$'), r'\g<1>0'),
    # The first tone is left unmarked, so a trailing 1 is removed.
    (re.compile(r'1$'), r''),
)
# Single-character lookup table from the intermediate spelling produced by
# BOPOMOFO_REPLACE to bopomofo (zhuyin) symbols. Keys and symbols are listed
# group by group; within each pair the n-th key maps to the n-th symbol.
BOPOMOFO_TABLE = {
    latin: symbol
    for keys, symbols in (
        # plain consonant initials
        ('bpmfdtnlgkhjqx', 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒ'),
        # Z/C/S are the placeholders for zh/ch/sh, followed by r, z, c, s
        ('ZCSrzcs', 'ㄓㄔㄕㄖㄗㄘㄙ'),
        # medials: i, u and v (ü)
        ('iuv', 'ㄧㄨㄩ'),
        # simple vowels; E is the placeholder for "eh"
        ('aoeE', 'ㄚㄛㄜㄝ'),
        # placeholders for ai, ei, ao, ou
        ('AIOU', 'ㄞㄟㄠㄡ'),
        # placeholders for an, en, ang, eng
        ('MNKG', 'ㄢㄣㄤㄥ'),
        # placeholder for er
        ('R', 'ㄦ'),
        # tone marks for tones 2, 3, 4 and the neutral tone (0); there is
        # no entry for '1'
        ('2340', 'ˊˇˋ˙'),
    )
    for latin, symbol in zip(keys, symbols)
}
26 changes: 20 additions & 6 deletions pypinyin/core.py
Expand Up @@ -12,9 +12,11 @@
from .compat import text_type, callable_check
from .constants import (
PHRASES_DICT, PINYIN_DICT, _INITIALS, PHONETIC_SYMBOL, RE_PHONETIC_SYMBOL,
RE_TONE2, RE_HANS, U_FINALS_EXCEPTIONS_MAP,
NORMAL, TONE, TONE2, INITIALS, FIRST_LETTER,
FINALS, FINALS_TONE, FINALS_TONE2
RE_TONE2, RE_TONE3, RE_HANS, U_FINALS_EXCEPTIONS_MAP,
BOPOMOFO_REPLACE, BOPOMOFO_TABLE,
NORMAL, TONE, TONE2, TONE3, INITIALS, FIRST_LETTER,
FINALS, FINALS_TONE, FINALS_TONE2, FINALS_TONE3,
BOPOMOFO, BOPOMOFO_FIRST
)
from .utils import simple_seg, _replace_tone2_style_dict_to_default

Expand Down Expand Up @@ -152,7 +154,8 @@ def _replace(m):
else:
return re.sub(RE_TONE2, r'\1', PHONETIC_SYMBOL[symbol])
# 使用数字标识声调
elif style in [TONE2, FINALS_TONE2]:
elif style in [TONE2, TONE3, FINALS_TONE2, FINALS_TONE3,
BOPOMOFO, BOPOMOFO_FIRST]:
# 返回使用数字标识声调的字符
return PHONETIC_SYMBOL[symbol]
# 声调在头上
Expand All @@ -161,17 +164,28 @@ def _replace(m):

# 替换拼音中的带声调字符
py = re.sub(RE_PHONETIC_SYMBOL, _replace, pinyin)
# 将声调移动到最后
if style in [TONE3, FINALS_TONE3, BOPOMOFO, BOPOMOFO_FIRST]:
py = RE_TONE3.sub(r'\1\3\2', py)

# 首字母
if style == FIRST_LETTER:
py = py[0]
# 韵母
elif style in [FINALS, FINALS_TONE, FINALS_TONE2]:
# 不处理鼻音: 'ḿ', 'ń', 'ň', 'ǹ '
elif style in [FINALS, FINALS_TONE, FINALS_TONE2, FINALS_TONE3]:
# 不处理鼻音: 'ḿ', 'ń', 'ň', 'ǹ'
if pinyin and pinyin[0] not in [
'\u1e3f', '\u0144', '\u0148', '\u01f9'
]:
py = final(py)
# 声调在拼音之后、注音
elif style in [BOPOMOFO, BOPOMOFO_FIRST]:
# 查表替换成注音
for f, r in BOPOMOFO_REPLACE:
py = f.sub(r, py)
py = ''.join(BOPOMOFO_TABLE.get(x, x) for x in py)
if style == BOPOMOFO_FIRST:
py = py[0]
return py


Expand Down
3 changes: 2 additions & 1 deletion pypinyin/phonetic_symbol.py
Expand Up @@ -25,7 +25,8 @@
"ú": "u2",
"ǔ": "u3",
"ù": "u4",
"ü": "v0",
# üe
"ü": "v",
"ǘ": "v2",
"ǚ": "v3",
"ǜ": "v4",
Expand Down
37 changes: 33 additions & 4 deletions tests/test_pinyin.py
Expand Up @@ -7,8 +7,9 @@

from pypinyin import (
pinyin, slug, lazy_pinyin, load_single_dict,
load_phrases_dict, NORMAL, TONE, TONE2, INITIALS,
FIRST_LETTER, FINALS, FINALS_TONE, FINALS_TONE2
load_phrases_dict, NORMAL, TONE, TONE2, TONE3, INITIALS,
FIRST_LETTER, FINALS, FINALS_TONE, FINALS_TONE2, FINALS_TONE3,
BOPOMOFO, BOPOMOFO_FIRST
)
from pypinyin.compat import SUPPORT_UCS4
from .utils import has_module
Expand All @@ -23,21 +24,29 @@ def test_pinyin_initials():
assert pinyin(hans, NORMAL) == [['zhong'], ['xin']]
# 声调风格,拼音声调在韵母第一个字母上
assert pinyin(hans, TONE) == [['zh\u014dng'], ['x\u012bn']]
# 声调风格2,即拼音声调在各个拼音之后,用数字 [0-4] 进行表示
# 声调风格2,即拼音声调在各个韵母之后,用数字 [1-4] 进行表示
assert pinyin(hans, TONE2) == [['zho1ng'], ['xi1n']]
# 声调风格3,即拼音声调在各个拼音之后,用数字 [1-4] 进行表示
assert pinyin(hans, TONE3) == [['zhong1'], ['xin1']]
# 声母风格,只返回各个拼音的声母部分
assert pinyin(hans, INITIALS) == [['zh'], ['x']]
# 首字母风格,只返回拼音的首字母部分
assert pinyin(hans, FIRST_LETTER) == [['z'], ['x']]
# 注音风格,带声调
assert pinyin(hans, BOPOMOFO) == [['ㄓㄨㄥ'], ['ㄒㄧㄣ']]
# 注音风格,首字母
assert pinyin(hans, BOPOMOFO_FIRST) == [['ㄓ'], ['ㄒ']]
# 启用多音字模式
assert pinyin(hans, heteronym=True) == [['zh\u014dng', 'zh\xf2ng'],
['x\u012bn']]
# 韵母风格1,只返回各个拼音的韵母部分,不带声调
assert pinyin(hans, style=FINALS) == [['ong'], ['in']]
# 韵母风格2,带声调,声调在韵母第一个字母上
assert pinyin(hans, style=FINALS_TONE) == [['\u014dng'], ['\u012bn']]
# 韵母风格2,带声调,声调在各个拼音之后,用数字 [0-4] 进行表示
# 韵母风格2,带声调,声调在各个韵母之后,用数字 [1-4] 进行表示
assert pinyin(hans, style=FINALS_TONE2) == [['o1ng'], ['i1n']]
# 韵母风格3,带声调,声调在各个拼音之后,用数字 [1-4] 进行表示
assert pinyin(hans, style=FINALS_TONE3) == [['ong1'], ['in1']]


def test_pinyin_finals():
Expand All @@ -48,14 +57,18 @@ def test_pinyin_finals():
assert pinyin(hans, NORMAL) == [['ao'], ['ao']]
assert pinyin(hans, TONE) == [['\xe1o'], ['\xe1o']]
assert pinyin(hans, TONE2) == [['a2o'], ['a2o']]
assert pinyin(hans, TONE3) == [['ao2'], ['ao2']]
assert pinyin(hans, INITIALS) == [[''], ['']]
assert pinyin(hans, FIRST_LETTER) == [['a'], ['a']]
assert pinyin(hans, BOPOMOFO) == [['ㄠˊ'], ['ㄠˊ']]
assert pinyin(hans, BOPOMOFO_FIRST) == [['ㄠ'], ['ㄠ']]
assert pinyin(hans, heteronym=True) == [['\xe1o'], ['\xe1o']]
assert pinyin('啊', heteronym=True) == \
[['a', 'è', 'ā', 'á', 'ǎ', 'à']]
assert pinyin(hans, style=FINALS) == [['ao'], ['ao']]
assert pinyin(hans, style=FINALS_TONE) == [['\xe1o'], ['\xe1o']]
assert pinyin(hans, style=FINALS_TONE2) == [['a2o'], ['a2o']]
assert pinyin(hans, style=FINALS_TONE3) == [['ao2'], ['ao2']]


def test_slug():
Expand Down Expand Up @@ -99,6 +112,7 @@ def test_lazy_pinyin():
assert lazy_pinyin('中心') == ['zhong', 'xin']
assert lazy_pinyin('中心', style=TONE) == ['zh\u014dng', 'x\u012bn']
assert lazy_pinyin('中心', style=INITIALS) == ['zh', 'x']
assert lazy_pinyin('中心', style=BOPOMOFO) == ['ㄓㄨㄥ', 'ㄒㄧㄣ']


@pytest.mark.skipif(not has_module('jieba'), reason='cant import jieba')
Expand Down Expand Up @@ -241,31 +255,41 @@ def test_simple_seg():
# 误把 yu 放到声母列表了
['鱼', {'style': TONE2}, ['yu2']],
['鱼', {'style': FINALS}, ['v']],
['鱼', {'style': BOPOMOFO}, ['ㄩˊ']],
['雨', {'style': TONE2}, ['yu3']],
['雨', {'style': FINALS}, ['v']],
['雨', {'style': BOPOMOFO}, ['ㄩˇ']],
['元', {'style': TONE2}, ['yua2n']],
['元', {'style': FINALS}, ['van']],
['元', {'style': BOPOMOFO}, ['ㄩㄢˊ']],
# y, w 也不是拼音, yu的韵母是v, yi的韵母是i, wu的韵母是u
['呀', {'style': INITIALS}, ['']],
['呀', {'style': TONE2}, ['ya']],
['呀', {'style': FINALS}, ['ia']],
['呀', {'style': BOPOMOFO}, ['ㄧㄚ˙']],
['无', {'style': INITIALS}, ['']],
['无', {'style': TONE2}, ['wu2']],
['无', {'style': FINALS}, ['u']],
['无', {'style': BOPOMOFO}, ['ㄨˊ']],
['衣', {'style': TONE2}, ['yi1']],
['衣', {'style': FINALS}, ['i']],
['衣', {'style': BOPOMOFO}, ['ㄧ']],
['万', {'style': TONE2}, ['wa4n']],
['万', {'style': FINALS}, ['uan']],
['万', {'style': BOPOMOFO}, ['ㄨㄢˋ']],
# ju, qu, xu 的韵母应该是 v
['具', {'style': FINALS_TONE}, ['ǜ']],
['具', {'style': FINALS_TONE2}, ['v4']],
['具', {'style': FINALS}, ['v']],
['具', {'style': BOPOMOFO}, ['ㄐㄩˋ']],
['取', {'style': FINALS_TONE}, ['ǚ']],
['取', {'style': FINALS_TONE2}, ['v3']],
['取', {'style': FINALS}, ['v']],
['取', {'style': BOPOMOFO}, ['ㄑㄩˇ']],
['徐', {'style': FINALS_TONE}, ['ǘ']],
['徐', {'style': FINALS_TONE2}, ['v2']],
['徐', {'style': FINALS}, ['v']],
['徐', {'style': BOPOMOFO}, ['ㄒㄩˊ']],
# ń
['嗯', {'style': NORMAL}, ['n']],
['嗯', {'style': TONE}, ['ń']],
Expand All @@ -275,6 +299,7 @@ def test_simple_seg():
['嗯', {'style': FINALS}, ['n']],
['嗯', {'style': FINALS_TONE}, ['ń']],
['嗯', {'style': FINALS_TONE2}, ['n2']],
['嗯', {'style': BOPOMOFO}, ['ㄣˊ']],
# ḿ \u1e3f U+1E3F
['呣', {'style': NORMAL}, ['m']],
['呣', {'style': TONE}, ['ḿ']],
Expand All @@ -284,8 +309,12 @@ def test_simple_seg():
['呣', {'style': FINALS}, ['m']],
['呣', {'style': FINALS_TONE}, ['ḿ']],
['呣', {'style': FINALS_TONE2}, ['m2']],
['呣', {'style': BOPOMOFO}, ['ㄇㄨˊ']],
# 41
['彷徨', {}, ['pang', 'huang']],
# 注音
['打量', {'style': BOPOMOFO}, ['ㄉㄚˇ', 'ㄌㄧㄤ˙']],
['黄山b股', {'style': BOPOMOFO}, ['ㄏㄨㄤˊ', 'ㄕㄢ', 'b', 'ㄍㄨˇ']],
]


Expand Down

0 comments on commit c837bfc

Please sign in to comment.