Skip to content

Commit

Permalink
Merge 4acbf10 into ba61a15
Browse files Browse the repository at this point in the history
  • Loading branch information
mozillazg committed Sep 21, 2017
2 parents ba61a15 + 4acbf10 commit 1a2126b
Show file tree
Hide file tree
Showing 12 changed files with 115 additions and 56 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,7 +1,7 @@
[bumpversion]
commit = True
tag = True
current_version = 0.23.0
current_version = 0.24.0

[bumpversion:file:pypinyin/__init__.py]

27 changes: 11 additions & 16 deletions .travis.yml
@@ -1,23 +1,28 @@
language: python
python:
- 3.5

sudo: false
dist: trusty
python: 3.6
sudo: required
before_install:
- sudo add-apt-repository -y ppa:deadsnakes/ppa
- sudo add-apt-repository -y ppa:pypy/ppa
- sudo apt-get -qq update
- sudo apt-get -y install python2.6 python3.3 python3.5 pypy

env:
- TOX_ENV=py26
- TOX_ENV=py27
- TOX_ENV=py33
- TOX_ENV=py34
- TOX_ENV=py35
- TOX_ENV=py36
- TOX_ENV=pypy
- TOX_ENV=py2_snownlp
- TOX_ENV=py2_jieba
- TOX_ENV=py3_jieba
- TOX_ENV=py3_env

install:
- pip install coveralls
- pip install coveralls codecov
- pip install tox
- pip install -r requirements_dev.txt

Expand All @@ -30,15 +35,5 @@ script:
- pypinyin < setup.cfg

after_script:
- codecov
- coveralls

notifications:
email:
on_success: never
on_failure: always

matrix:
include:
- python: 3.6
env:
- TOX_ENV=py36
9 changes: 9 additions & 0 deletions CHANGELOG.rst
@@ -1,6 +1,13 @@
Changelog
---------

0.24.0 (2017-09-17)
+++++++++++++++++++++

* **[New]** 支持类似 pyinstaller 的打包工具对使用 pypinyin 的程序进行打包,
不会出现跟打包前不一样的输出(比如: `#92`_ )(via `#93`_ )。


0.23.0 (2017-07-09)
+++++++++++++++++++++

Expand Down Expand Up @@ -489,3 +496,5 @@ __ https://github.com/mozillazg/python-pinyin/issues/8
.. _phrase-pinyin-data: https://github.com/mozillazg/phrase-pinyin-data
.. _@LevyLession: https://github.com/LevyLession
.. _#86: https://github.com/mozillazg/python-pinyin/issues/86
.. _#92: https://github.com/mozillazg/python-pinyin/issues/92
.. _#93: https://github.com/mozillazg/python-pinyin/issues/93
11 changes: 5 additions & 6 deletions README.rst
Expand Up @@ -45,19 +45,18 @@ Python 3(Python 2 下把 ``'中心'`` 替换为 ``u'中心'`` 即可):

.. code-block:: python
>>> from pypinyin import pinyin, lazy_pinyin
>>> import pypinyin
>>> from pypinyin import pinyin, lazy_pinyin, Style
>>> pinyin('中心')
[['zhōng'], ['xīn']]
>>> pinyin('中心', heteronym=True) # 启用多音字模式
[['zhōng', 'zhòng'], ['xīn']]
>>> pinyin('中心', style=pypinyin.FIRST_LETTER) # 设置拼音风格
>>> pinyin('中心', style=Style.FIRST_LETTER) # 设置拼音风格
[['z'], ['x']]
>>> pinyin('中心', style=pypinyin.TONE2, heteronym=True)
>>> pinyin('中心', style=Style.TONE2, heteronym=True)
[['zho1ng', 'zho4ng'], ['xi1n']]
>>> pinyin('中心', style=pypinyin.BOPOMOFO) # 注音风格
>>> pinyin('中心', style=Style.BOPOMOFO) # 注音风格
[['ㄓㄨㄥ'], ['ㄒㄧㄣ']]
>>> pinyin('中心', style=pypinyin.CYRILLIC) # 俄语字母风格
>>> pinyin('中心', style=Style.CYRILLIC) # 俄语字母风格
[['чжун1'], ['синь1']]
>>> lazy_pinyin('中心') # 不考虑多音字的情况
['zhong', 'xin']
Expand Down
27 changes: 13 additions & 14 deletions docs/usage.rst
Expand Up @@ -7,15 +7,14 @@

.. code-block:: python
>>> from pypinyin import pinyin, lazy_pinyin
>>> import pypinyin
>>> from pypinyin import pinyin, lazy_pinyin, Style
>>> pinyin('中心')
[['zhōng'], ['xīn']]
>>> pinyin('中心', heteronym=True) # 启用多音字模式
[['zhōng', 'zhòng'], ['xīn']]
>>> pinyin('中心', style=pypinyin.FIRST_LETTER) # 设置拼音风格
>>> pinyin('中心', style=Style.FIRST_LETTER) # 设置拼音风格
[['z'], ['x']]
>>> pinyin('中心', style=pypinyin.TONE2, heteronym=True)
>>> pinyin('中心', style=Style.TONE2, heteronym=True)
[['zho1ng', 'zho4ng'], ['xi1n']]
>>> lazy_pinyin('中心') # 不考虑多音字的情况
['zhong', 'xin']
Expand Down Expand Up @@ -83,13 +82,13 @@

.. code-block:: python
>> from pypinyin import lazy_pinyin, TONE2
>> from pypinyin import lazy_pinyin, Style
>> from snownlp import SnowNLP
>> hans = '音乐123'
>> hans_seg = SnowNLP(hans).words # 分词处理
>> hans_seg
['音乐', '123']
>> lazy_pinyin(hans_seg, style=TONE2)
>> lazy_pinyin(hans_seg, style=Style.TONE2)
['yi1n', 'yue4', '123']
Expand All @@ -106,32 +105,32 @@

.. code-block:: python
>> from pypinyin import lazy_pinyin, load_phrases_dict, TONE2
>> from pypinyin import lazy_pinyin, load_phrases_dict, Style
>> hans = '桔子'
>> lazy_pinyin(hans, style=TONE2)
>> lazy_pinyin(hans, style=Style.TONE2)
['jie2', 'zi3']
>> load_phrases_dict({'桔子': [['jú'], ['zǐ']]})
>> lazy_pinyin(hans, style=TONE2)
>> lazy_pinyin(hans, style=Style.TONE2)
['ju2', 'zi3']
**未安装 jieba 分词模块 and/or 不支持分词的词组**

.. code-block:: python
>> from pypinyin import lazy_pinyin, load_phrases_dict, TONE2, load_single_dict
>> from pypinyin import lazy_pinyin, load_phrases_dict, Style, load_single_dict
>> hans = '还没'
>> lazy_pinyin(hans, style=TONE2)
>> lazy_pinyin(hans, style=Style.TONE2)
['hua2n', 'me2i']
>>> # 第一种自定义词组的方法
>> load_phrases_dict({'还没': [['hái'], ['méi']]})
>>> lazy_pinyin('还没', style=TONE2)})
>>> lazy_pinyin('还没', style=Style.TONE2)
['hua2n', 'me2i']
>>> lazy_pinyin(['还没'], style=TONE2) # 手动指定 "还没" 为一个词组
>>> lazy_pinyin(['还没'], style=Style.TONE2) # 手动指定 "还没" 为一个词组
['ha2i', 'me2i']
>>> # 第二种自定义词组的方法
>> load_single_dict({ord('还'): 'hái,huán'}) # 调整 "还" 字的拼音顺序
>>> lazy_pinyin('还没', style=TONE2)
>>> lazy_pinyin('还没', style=Style.TONE2)
['ha2i', 'me2i']
Expand Down
2 changes: 1 addition & 1 deletion pypinyin/__init__.py
Expand Up @@ -28,7 +28,7 @@
)

__title__ = 'pypinyin'
__version__ = '0.23.0'
__version__ = '0.24.0'
__author__ = 'mozillazg, 闲耘'
__license__ = 'MIT'
__copyright__ = 'Copyright (c) 2016 mozillazg, 闲耘'
Expand Down
Empty file added pypinyin/contrib/__init__.py
Empty file.
56 changes: 56 additions & 0 deletions pypinyin/contrib/mmseg.py
@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
"""最大正向匹配分词"""


class Seg(object):
    """Forward maximum-matching word segmenter.

    :param prefix_set: container supporting the ``in`` operator; membership
                       is tested against successively longer leading
                       slices of the text (e.g. a :class:`PrefixSet`)
    """

    def __init__(self, prefix_set):
        self.prefix_set = prefix_set

    def cut(self, text):
        """Split ``text`` into segments.

        :param text: the text to segment
        :yield: one segment at a time
        """
        rest = text
        while rest:
            total = len(rest)
            # Extend the candidate one character at a time for as long as
            # the leading slice is still a member of the prefix set.
            end = 0
            while end < total and rest[:end + 1] in self.prefix_set:
                end += 1

            if end == total:
                # Everything that is left matched: emit it as one segment.
                yield rest
                return

            if end == 0:
                # Not even the first character matched: emit it on its own.
                end = 1

            yield rest[:end]
            # Restart matching on whatever follows the emitted segment.
            rest = rest[end:]


class PrefixSet(object):
    """Set of every non-empty prefix of the words it has been trained on."""

    def __init__(self):
        self._prefix_set = set()

    def train(self, word_s):
        """Add all prefixes of the given words to the prefix set.

        :param word_s: collection of words
        :type word_s: iterable
        :return: None
        """
        for entry in word_s:
            # Record every leading slice, from the first character up to
            # the complete word.
            self._prefix_set.update(
                entry[:stop] for stop in range(1, len(entry) + 1)
            )

    def __contains__(self, key):
        return key in self._prefix_set
11 changes: 7 additions & 4 deletions pypinyin/core.py
Expand Up @@ -210,14 +210,15 @@ def pinyin(hans, style=TONE, heteronym=False, errors='default', strict=False):
也可以使用自己喜爱的分词模块对字符串进行分词处理,
只需将经过分词处理的字符串列表传进来就可以了。
:type hans: unicode 字符串或字符串列表
:param style: 指定拼音风格,默认是 ``TONE`` 风格
:param style: 指定拼音风格,默认是 :py:attr:`~pypinyin.Style.TONE` 风格。
更多拼音风格详见 :class:`~pypinyin.Style`
:param errors: 指定如何处理没有拼音的字符
* ``'default'``: 保留原始字符
* ``'ignore'``: 忽略该字符
* ``'replace'``: 替换为去掉 ``\\u`` 的 unicode 编码字符串
(``'\\u90aa'`` => ``'90aa'``)
* callable 对象: 回调函数之类的可调用对象。如果 ``erros``
* callable 对象: 回调函数之类的可调用对象。如果 ``errors``
参数 的值是个可调用对象,那么程序会回调这个函数:
``func(char)``::
Expand Down Expand Up @@ -260,7 +261,8 @@ def slug(hans, style=NORMAL, heteronym=False, separator='-',
:param hans: 汉字
:type hans: unicode or list
:param style: 指定拼音风格,默认是 ``NORMAL`` 风格
:param style: 指定拼音风格,默认是 :py:attr:`~pypinyin.Style.NORMAL` 风格。
更多拼音风格详见 :class:`~pypinyin.Style`
:param heteronym: 是否启用多音字
:param separator: 两个拼音间的分隔符/连接符
:param errors: 指定如何处理没有拼音的字符,详情请参考
Expand Down Expand Up @@ -293,7 +295,8 @@ def lazy_pinyin(hans, style=NORMAL, errors='default', strict=True):
:param hans: 汉字
:type hans: unicode or list
:param style: 指定拼音风格,默认是 ``NORMAL`` 风格
:param style: 指定拼音风格,默认是 :py:attr:`~pypinyin.Style.NORMAL` 风格。
更多拼音风格详见 :class:`~pypinyin.Style`。
:param errors: 指定如何处理没有拼音的字符,详情请参考
:py:func:`~pypinyin.pinyin`
:param strict: 是否严格遵照《汉语拼音方案》来处理声母和韵母
Expand Down
21 changes: 9 additions & 12 deletions pypinyin/style/__init__.py
@@ -1,9 +1,6 @@
# -*- coding: utf-8 -*-
from functools import wraps
import glob
import os

current_dir = os.path.dirname(os.path.realpath(__file__))
# 存储各拼音风格对应的实现
_registry = {}

Expand Down Expand Up @@ -53,12 +50,12 @@ def wrapper(pinyin, **kwargs):


def auto_discover():
"""自动发现内置的拼音风格实现"""
for path in glob.glob(current_dir + os.path.sep + '*.py'):
filename = os.path.basename(path)
module_name = filename.split('.')[0]
if (not module_name) or module_name.startswith('_'):
continue

full_module_name = 'pypinyin.style.{0}'.format(module_name)
__import__(full_module_name)
"""自动注册内置的拼音风格实现"""
from pypinyin.style import ( # noqa
initials,
tone,
finals,
bopomofo,
cyrillic,
others,
)
1 change: 1 addition & 0 deletions setup.py
Expand Up @@ -19,6 +19,7 @@

packages = [
'pypinyin',
'pypinyin.contrib',
'pypinyin.style',
]

Expand Down
4 changes: 2 additions & 2 deletions tox.ini
Expand Up @@ -27,7 +27,7 @@ deps =
{[jieba]deps}

[testenv:py3_jieba]
basepython = python3.5
basepython = python3.6
deps =
{[jieba]deps}

Expand All @@ -38,7 +38,7 @@ deps =
{[base]deps}

[testenv:py3_env]
basepython = python3.5
basepython = python3.6
deps =
{[jieba]deps}
setenv =
Expand Down

0 comments on commit 1a2126b

Please sign in to comment.