-
-
Notifications
You must be signed in to change notification settings - Fork 604
/
test_mmseg.py
118 lines (103 loc) · 2.69 KB
/
test_mmseg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import pytest
from pypinyin import pinyin, load_phrases_dict
from pypinyin.contrib import mmseg
# Segmenter used by test_mmseg: a Seg over a fresh PrefixSet that is trained
# below on a small hand-written vocabulary.
seg_test = mmseg.Seg(mmseg.PrefixSet())
seg_test._prefix_set.train([
    'a',
    'ab',
    'abc',
    'abcd',
    'abd',
    'ac',
    'acd',
    'aff',
    'agf',
    'agfgef',
    'asdf',
    # The trailing comma matters: without it Python concatenates this literal
    # with the next one and trains the single word 'bbs中国' instead.
    'bbs',
    '中国',
    '中国人',
    '中国人民',
    '中国人民银行',
    '我',
    '北京',
    '天安门',
    '员工',
])
@pytest.mark.parametrize(
    'input, expect',
    [
        ('', []),
        ('a', ['a']),
        ('abc', ['abc']),
        ('abcefg', ['abc', 'e', 'f', 'g']),
        ('bbcabce', ['bb', 'c', 'abc', 'e']),
        ('北京', ['北京']),
        ('北京,', ['北京', ',']),
        ('北京abc', ['北京', 'abc']),
        ('中国人民银行行长', ['中国人民银行', '行', '长']),
        ('中国人民银行员工', ['中国人民银行', '员工']),
        (
            'abcadbasfgafgasdabcagfaff我是中国人中国人民我爱北京天安门',
            [
                # Latin run
                'abc', 'a', 'd', 'b', 'as', 'f', 'g', 'af', 'g', 'asd',
                'abc', 'agf', 'aff',
                # Chinese run
                '我', '是', '中国人', '中国人民', '我', '爱', '北京', '天安门',
            ],
        ),
    ],
)
def test_mmseg(input, expect):
    """Cutting ``input`` with ``seg_test`` yields exactly ``expect``."""
    pieces = list(seg_test.cut(input))
    assert pieces == expect
@pytest.mark.parametrize(
    'input, default_ret, mmseg_ret',
    [
        (
            '一语中的啊',
            [['yī'], ['yǔ'], ['zhōng'], ['de'], ['a']],
            [['yī'], ['yǔ'], ['zhòng'], ['dì'], ['a']],
        ),
    ],
)
def test_mmseg_for_pinyin(input, default_ret, mmseg_ret):
    """``pinyin`` returns ``mmseg_ret`` for raw text and for pre-cut text.

    ``default_ret`` is part of the case data but is not asserted on here.
    """
    from_text = pinyin(input)
    assert from_text == mmseg_ret

    from_segments = pinyin(mmseg.seg.cut(input))
    assert from_segments == mmseg_ret
@pytest.mark.parametrize(
    'input, jieba_ret, mmseg_ret',
    [
        (
            '了局啊',
            [['le'], ['jú'], ['a']],
            [['liǎo'], ['jú'], ['a']],
        ),
    ],
)
def test_mmseg_and_jieba_for_pinyin(input, jieba_ret, mmseg_ret):
    """``pinyin`` returns ``mmseg_ret`` for raw text and for pre-cut text.

    ``jieba_ret`` is part of the case data but is not asserted on here.
    """
    from_text = pinyin(input)
    assert from_text == mmseg_ret

    from_segments = pinyin(mmseg.seg.cut(input))
    assert from_segments == mmseg_ret
def test_retrain():
    """A phrase added via ``load_phrases_dict`` is cut whole after retrain."""
    seg = mmseg.seg
    phrase = '啊啊啊'

    # Before the phrase is registered, each character comes out on its own.
    assert list(seg.cut(phrase)) == ['啊', '啊', '啊']

    load_phrases_dict({phrase: [['a'], ['a'], ['a']]})
    mmseg.retrain(seg)

    # After retraining, the whole phrase comes out as a single piece.
    assert list(seg.cut(phrase)) == [phrase]
if __name__ == '__main__':
    # Pass pytest's return code on as the process exit status so that running
    # this file directly exits non-zero when a test fails.
    raise SystemExit(pytest.main())