Skip to content

Commit

Permalink
Merge branch 'improve-FINALS' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
mozillazg committed May 5, 2017
2 parents cc29176 + 806a6fc commit 3965245
Show file tree
Hide file tree
Showing 11 changed files with 460 additions and 61 deletions.
20 changes: 19 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,23 @@
Changelog
---------

0.19.0 (2017-05-05)
+++++++++++++++++++++

* **[New]** 韵母风格下根据 `汉语拼音方案`_ 还原原始的 ``iou`` , ``uei`` , ``uen`` 韵母。

iou,uei,uen前面加声母的时候,写成iu,ui,un。
例如niu(牛),gui(归),lun(论)。即:

* niu 的韵母是 iou
* gui 的韵母是 uei
* lun 的韵母是 uen
* **[Fixed]** 修复韵母相关风格下没有正确处理 ``wu`` 的韵母的问题
(比如: ``无`` 在 ``FINALS_TONE`` 风格下的结果是 ``uú`` 的问题) 。
* **[Fixed]** 修复漏了 ǖ -> v1 的转换。



0.18.2 (2017-04-25)
+++++++++++++++++++++

Expand Down Expand Up @@ -390,7 +407,8 @@ __ https://github.com/mozillazg/python-pinyin/issues/8
.. _#22: https://github.com/mozillazg/python-pinyin/pull/22
.. _#26: https://github.com/mozillazg/python-pinyin/pull/26
.. _@MingStar: https://github.com/MingStar
.. _汉语拼音方案: http://www.edu.cn/20011114/3009777.shtml
.. _汉语拼音方案: https://zh.wiktionary.org/wiki/%E9%99%84%E5%BD%95:%E6%B1%89%E8%AF%AD%E6%8B%BC%E9%9F%B3%E6%96%B9%E6%A1%88
.. _汉语拼音方案.pdf: http://www.moe.edu.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html
.. _汉语拼音 - 维基百科: https://zh.wikipedia.org/wiki/%E6%B1%89%E8%AF%AD%E6%8B%BC%E9%9F%B3#cite_ref-10
.. _@xulin97: https://github.com/xulin97
.. _#31: https://github.com/mozillazg/python-pinyin/issues/31
Expand Down
27 changes: 17 additions & 10 deletions pypinyin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,24 @@

from __future__ import unicode_literals

from .compat import PY2
from .constants import ( # noqa
STYLE_NORMAL, NORMAL, STYLE_TONE, TONE,
STYLE_TONE2, TONE2, STYLE_TONE3, TONE3,
STYLE_INITIALS, INITIALS, STYLE_FIRST_LETTER, FIRST_LETTER,
STYLE_FINALS, FINALS, STYLE_FINALS_TONE, FINALS_TONE,
STYLE_FINALS_TONE2, FINALS_TONE2, STYLE_FINALS_TONE3, FINALS_TONE3,
STYLE_BOPOMOFO, BOPOMOFO, STYLE_BOPOMOFO_FIRST, BOPOMOFO_FIRST,
STYLE_CYRILLIC, CYRILLIC, STYLE_CYRILLIC_FIRST, CYRILLIC_FIRST
from pypinyin.compat import PY2
from pypinyin.constants import ( # noqa
STYLE_NORMAL, NORMAL,
STYLE_TONE, TONE,
STYLE_TONE2, TONE2,
STYLE_TONE3, TONE3,
STYLE_INITIALS, INITIALS,
STYLE_FIRST_LETTER, FIRST_LETTER,
STYLE_FINALS, FINALS,
STYLE_FINALS_TONE, FINALS_TONE,
STYLE_FINALS_TONE2, FINALS_TONE2,
STYLE_FINALS_TONE3, FINALS_TONE3,
STYLE_BOPOMOFO, BOPOMOFO,
STYLE_BOPOMOFO_FIRST, BOPOMOFO_FIRST,
STYLE_CYRILLIC, CYRILLIC,
STYLE_CYRILLIC_FIRST, CYRILLIC_FIRST
)
from .core import ( # noqa
from pypinyin.core import ( # noqa
pinyin, lazy_pinyin, slug, load_single_dict, load_phrases_dict
)

Expand Down
4 changes: 2 additions & 2 deletions pypinyin/__main__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
from .runner import main

from pypinyin.runner import main

if __name__ == '__main__':
sys.exit(main())
20 changes: 6 additions & 14 deletions pypinyin/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
import os
import re

from . import phonetic_symbol, pinyin_dict
from .compat import SUPPORT_UCS4
from pypinyin import phonetic_symbol, pinyin_dict
from pypinyin.compat import SUPPORT_UCS4

# 词语拼音库
if os.environ.get('PYPINYIN_NO_PHRASES'):
PHRASES_DICT = {}
else:
from . import phrases_dict
from pypinyin import phrases_dict
PHRASES_DICT = phrases_dict.phrases_dict.copy()

# 单字拼音库
Expand Down Expand Up @@ -93,18 +93,10 @@
BOPOMOFO = STYLE_BOPOMOFO = PINYIN_STYLE['BOPOMOFO']
# 注音符号首字母
BOPOMOFO_FIRST = STYLE_BOPOMOFO_FIRST = PINYIN_STYLE['BOPOMOFO_FIRST']

# 俄语
CYRILLIC = STYLE_CYRILLIC = PINYIN_STYLE['CYRILLIC']

CYRILLIC_FIRST = STYLE_CYRILLIC_FIRST = PINYIN_STYLE['CYRILLIC_FIRST']

U_FINALS_EXCEPTIONS_MAP = {
u'ū': u'ǖ',
u'ú': u'ǘ',
u'ǔ': u'ǚ',
u'ù': u'ǜ',
}

# 注音转换表
BOPOMOFO_REPLACE = (
(re.compile('^m(\d)$'), 'mu\\1'), # 呣
(re.compile('^n(\d)$'), 'N\\1'), # 嗯
Expand Down Expand Up @@ -139,7 +131,7 @@
'bpmfdtnlgkhjqxZCSrzcsiuvaoeEAIOUMNKGR2340',
'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄧㄨㄩㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦˊˇˋ˙'
))

# 俄语转换表
CYRILLIC_REPLACE = (
(re.compile('ong'), 'ung'),
(re.compile('([zcs])i'), '\\1U'),
Expand Down
43 changes: 15 additions & 28 deletions pypinyin/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,19 @@
import re
import warnings

from .compat import text_type, callable_check
from .constants import (
from pypinyin.compat import text_type, callable_check
from pypinyin.constants import (
PHRASES_DICT, PINYIN_DICT, _INITIALS, PHONETIC_SYMBOL, RE_PHONETIC_SYMBOL,
RE_TONE2, RE_TONE3, RE_HANS, U_FINALS_EXCEPTIONS_MAP,
RE_TONE2, RE_TONE3, RE_HANS,
BOPOMOFO_REPLACE, BOPOMOFO_TABLE,
CYRILLIC_REPLACE, CYRILLIC_TABLE,
NORMAL, TONE, TONE2, TONE3, INITIALS, FIRST_LETTER,
FINALS, FINALS_TONE, FINALS_TONE2, FINALS_TONE3,
BOPOMOFO, BOPOMOFO_FIRST,
CYRILLIC, CYRILLIC_FIRST
)
from .utils import simple_seg, _replace_tone2_style_dict_to_default
from pypinyin.standard import convert_finals
from pypinyin.utils import simple_seg, _replace_tone2_style_dict_to_default


def seg(hans):
Expand Down Expand Up @@ -106,34 +107,14 @@ def final(pinyin):
:return: 韵母
:rtype: unicode
"""
initial_ = initial(pinyin) or None
initial_ = initial(pinyin) or ''
# 没有声母,整个都是韵母
if not initial_:
return no_initial_final(pinyin)
# 特例 j/q/x
m = re.match(r'^(j|q|x)(ū|ú|ǔ|ù)$', pinyin)
if m:
return (U_FINALS_EXCEPTIONS_MAP[m.group(2)])
pinyin = re.sub(r'^(j|q|x)u(\d?)$', r'\1v\2', pinyin)
return pinyin
# 按声母分割,剩下的就是韵母
return ''.join(pinyin.split(initial_, 1))


def no_initial_final(pinyin):
    """Return the final of a syllable that has no initial consonant.

    Zero-initial syllables are spelled with a leading ``y`` or ``w``; this
    undoes that spelling:

    * ``yu...`` -> ``v...`` (``u`` stands for ü here); a tone-marked ``u``
      becomes the matching tone-marked ``ü``
    * ``yi...`` -> ``i...`` (the ``y`` is dropped), with or without tone mark
    * ``y...``  -> ``i...`` (``y`` stands for ``i``)
    * ``wu...`` -> ``u...`` (the ``w`` is dropped), with or without tone mark
    * ``w...``  -> ``u...`` (``w`` stands for ``u``)

    Anything not starting with ``y`` or ``w`` is returned unchanged.

    :param pinyin: pinyin of one syllable without an initial
    :return: the final
    :rtype: unicode
    """
    # Plain "u" keeps the historical "v" spelling; tone-marked forms map to
    # the tone-marked ü so that e.g. "yú" is not mistaken for "i" + "ú".
    u_forms = {'u': 'v', 'ū': 'ǖ', 'ú': 'ǘ', 'ǔ': 'ǚ', 'ù': 'ǜ'}
    i_forms = ('i', 'ī', 'í', 'ǐ', 'ì')

    lead, rest = pinyin[:1], pinyin[1:]
    # First letter after y/w; empty string when there is none.
    nucleus = rest[:1]

    if lead == 'y':
        if nucleus in u_forms:
            return u_forms[nucleus] + rest[1:]
        if nucleus in i_forms:
            return rest
        return 'i' + rest
    if lead == 'w':
        # "wu" (any tone): the w is purely orthographic, drop it.
        if nucleus in u_forms:
            return rest
        return 'u' + rest
    return pinyin


def to_fixed(pinyin, style):
"""根据拼音风格格式化带声调的拼音.
Expand All @@ -145,6 +126,12 @@ def to_fixed(pinyin, style):
# 声母
if style == INITIALS:
return initial(pinyin)
if style == TONE:
return pinyin

# 根据标准还原正确的韵母
if style in [FINALS, FINALS_TONE, FINALS_TONE2, FINALS_TONE3]:
pinyin = convert_finals(pinyin)

def _replace(m):
symbol = m.group(0) # 带声调的字符
Expand Down
1 change: 1 addition & 0 deletions pypinyin/phonetic_symbol.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"ù": "u4",
# üe
"ü": "v",
"ǖ": "v1",
"ǘ": "v2",
"ǚ": "v3",
"ǜ": "v4",
Expand Down
4 changes: 2 additions & 2 deletions pypinyin/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
from argparse import ArgumentParser
import sys

from . import ( # noqa
from pypinyin import ( # noqa
__title__, __version__, pinyin, slug,
NORMAL, TONE, TONE2, TONE3, INITIALS, FIRST_LETTER,
FINALS, FINALS_TONE, FINALS_TONE2, FINALS_TONE3,
BOPOMOFO, BOPOMOFO_FIRST, CYRILLIC, CYRILLIC_FIRST
)
from .compat import PY2
from pypinyin.compat import PY2


class NullWriter(object):
Expand Down
152 changes: 152 additions & 0 deletions pypinyin/standard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
处理汉语拼音方案中的一些特殊情况
汉语拼音方案:
* https://zh.wiktionary.org/wiki/%E9%99%84%E5%BD%95:%E6%B1%89%E8%AF%AD%E6%8B%BC%E9%9F%B3%E6%96%B9%E6%A1%88
* http://www.moe.edu.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html
""" # noqa
from __future__ import unicode_literals

import re

# u -> ü
# Each plain or tone-marked "u" paired with the "ü" it stands for when the
# two dots are omitted in writing.
UV_MAP = {
    'u': 'ü',
    'ū': 'ǖ',
    'ú': 'ǘ',
    'ǔ': 'ǚ',
    'ù': 'ǜ',
}
U_TONES = set(UV_MAP.keys())
# Finals of the ü row are written with "u" after the initials j, q, x:
# ju, qu, xu.  The pattern is anchored at the start only, so compound finals
# (jue, quan, xun, ...) match as well; a substitution rewrites just the
# "initial + u" prefix and leaves the remainder of the syllable untouched.
UV_RE = re.compile(r'^(j|q|x)({tones})'.format(tones='|'.join(UV_MAP.keys())))
# Plain and tone-marked forms of "i".
I_TONES = set(['i', 'ī', 'í', 'ǐ', 'ì'])

# iu -> iou
# Written "iu" finals (plain and tone-marked) paired with the full "iou"
# form; an "o" is inserted and the tone mark stays on the "u".
IU_MAP = dict(
    ('i' + u, 'io' + u) for u in ('u', 'ū', 'ú', 'ǔ', 'ù')
)
IU_TONES = set(IU_MAP)
# Whole syllable: one or more ASCII letters followed by a written "iu" final.
IU_RE = re.compile(
    r'^([a-z]+)({tones})$'.format(tones='|'.join(IU_TONES))
)

# ui -> uei
# Written "ui" finals (plain and tone-marked) paired with the full "uei"
# form; an "e" is inserted and the tone mark stays on the "i".
UI_MAP = dict(
    ('u' + i, 'ue' + i) for i in ('i', 'ī', 'í', 'ǐ', 'ì')
)
UI_TONES = set(UI_MAP)
# One or more ASCII letters followed by a written "ui" final at the end of
# the string (no start anchor).
UI_RE = re.compile(
    r'([a-z]+)({tones})$'.format(tones='|'.join(UI_TONES))
)

# un -> uen
# Written "un" finals (plain and tone-marked) paired with the full "uen"
# form; an "e" is inserted after the "u", which keeps its tone mark.
UN_MAP = dict(
    (u + 'n', u + 'en') for u in ('u', 'ū', 'ú', 'ǔ', 'ù')
)
UN_TONES = set(UN_MAP)
# One or more ASCII letters followed by a written "un" final at the end of
# the string (no start anchor).
UN_RE = re.compile(
    r'([a-z]+)({tones})$'.format(tones='|'.join(UN_TONES))
)


def convert_zero_consonant(pinyin):
    """Restore the original final of a zero-initial syllable.

    Rules from the Scheme for the Chinese Phonetic Alphabet:

    * Finals of the i row with no initial are written yi, ya, ye, yao,
      you, yan, yin, yang, ying, yong.
    * Finals of the u row with no initial are written wu, wa, wo, wai,
      wei, wan, wen, wang, weng.
    * Finals of the ü row with no initial are written yu, yue, yuan, yun,
      with the two dots dropped.

    This function reverses those spellings (tone marks are honoured):

    * ``yu...`` -> ``ü...``
    * ``yi...`` -> ``i...``
    * ``y...``  -> ``i...``
    * ``wu...`` -> ``u...``
    * ``w...``  -> ``u...``

    A string that does not start with ``y`` or ``w`` is returned unchanged.
    """
    lead, rest = pinyin[:1], pinyin[1:]
    if lead not in ('y', 'w'):
        return pinyin

    # First letter after the y/w, or None when nothing follows it.
    nucleus = rest[0] if rest else None

    if lead == 'y':
        # yu -> ü, e.g. yue -> üe
        if nucleus in U_TONES:
            return UV_MAP[nucleus] + rest[1:]
        # yi -> i: the y is only orthographic
        if nucleus in I_TONES:
            return rest
        # y stands for i, e.g. ya -> ia
        return 'i' + rest

    # lead == 'w'
    # wu -> u: the w is only orthographic
    if nucleus in U_TONES:
        return rest
    # w stands for u, e.g. wa -> ua
    return 'u' + rest


def convert_uv(pinyin):
    """Restore ü where it is written as u after j, q or x.

    Scheme rule: finals of the ü row combined with the initials j, q, x are
    written ju, qu, xu with the two dots dropped; combined with n and l the
    spelling nü, lü is kept.

    Each ``UV_RE`` match is replaced by its first group (the initial)
    followed by the ``UV_MAP`` entry for its second group (the u form).
    """
    def _restore(match):
        return match.group(1) + UV_MAP[match.group(2)]

    return UV_RE.sub(_restore, pinyin)


def convert_iou(pinyin):
    """Restore the original final iou from its written form iu.

    Scheme rule: iou, uei, uen preceded by an initial are written iu, ui, un,
    e.g. niu, gui, lun.

    Each ``IU_RE`` match is replaced by its first group followed by the
    ``IU_MAP`` entry for its second group.
    """
    def _expand(match):
        prefix = match.group(1)
        written = match.group(2)
        return prefix + IU_MAP[written]

    return IU_RE.sub(_expand, pinyin)


def convert_uei(pinyin):
    """Restore the original final uei from its written form ui.

    Scheme rule: iou, uei, uen preceded by an initial are written iu, ui, un,
    e.g. niu, gui, lun.

    Each ``UI_RE`` match is replaced by its first group followed by the
    ``UI_MAP`` entry for its second group.
    """
    def _expand(match):
        prefix = match.group(1)
        written = match.group(2)
        return prefix + UI_MAP[written]

    return UI_RE.sub(_expand, pinyin)


def convert_uen(pinyin):
    """Restore the original final uen from its written form un.

    Scheme rule: iou, uei, uen preceded by an initial are written iu, ui, un,
    e.g. niu, gui, lun.

    Each ``UN_RE`` match is replaced by its first group followed by the
    ``UN_MAP`` entry for its second group.
    """
    def _expand(match):
        prefix = match.group(1)
        written = match.group(2)
        return prefix + UN_MAP[written]

    return UN_RE.sub(_expand, pinyin)


def convert_finals(pinyin):
    """Restore the original final by applying every conversion in order.

    The steps run in sequence, each receiving the previous step's output:
    zero-initial (y/w) spelling, u -> ü, iu -> iou, ui -> uei, un -> uen.
    """
    pipeline = (
        convert_zero_consonant,
        convert_uv,
        convert_iou,
        convert_uei,
        convert_uen,
    )
    for step in pipeline:
        pinyin = step(pinyin)
    return pinyin

0 comments on commit 3965245

Please sign in to comment.