-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a96dc5d
commit b7e69a7
Showing
3 changed files
with
261 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
Latin-1 Supplement - Unicode U+0080 - U+00FF - (128-255) | ||
á = á = á = á | ||
à = à = à = à | ||
é = é = é = é | ||
è = è = è = è | ||
í = í = í = í | ||
ì = ì = ì = ì | ||
ó = ó = ó = ó | ||
ò = ò = ò = ò | ||
ú = ú = ú = ú | ||
ù = ù = ù = ù | ||
ü = ü = ü = ü | ||
subtract 32 for upper case | ||
|
||
Latin Extended-A - Unicode U+0100 - U+017F - (256-383) | ||
ā = ā = ā | ||
ē = ē = ē | ||
ě = ě = ě | ||
ī = ī = ī | ||
ō = ō = ō | ||
ū = ū = ū | ||
subtract 1 for upper case | ||
|
||
Latin Extended-B U+0180 - U+024F (384-591) | ||
ǎ = ǎ = ǎ | ||
ǐ = ǐ = ǐ | ||
ǒ = ǒ = ǒ | ||
ǔ = ǔ = ǔ | ||
|
||
ǖ = ǖ = ǖ | ||
ǘ = ǘ = ǘ | ||
ǚ = ǚ = ǚ | ||
ǜ = ǜ = ǜ | ||
subtract 1 for upper case | ||
|
||
ā á ǎ à a | ||
ē é ě è e | ||
ī í ǐ ì i | ||
ō ó ǒ ò o | ||
ū ú ǔ ù u | ||
ǖ ǘ ǚ ǜ ü |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#!/usr/bin/python | ||
# -*- coding: utf-8 -*- | ||
|
||
""" Unit tests for pinyinizer.py """ | ||
import os
import sys

# The original appended the absolute path "/..", which resolves to the
# filesystem root and therefore never helped locate the module under test.
# Add this file's parent directory instead, independent of the current
# working directory.
# NOTE(review): assumes pinyinizer.py lives one directory above this test
# script -- confirm against the repository layout.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))

import pinyinizer
|
||
def getKnownValues(filename):
    """Load the file of known values used for testing.

    The file is read line by line:

    * A line starting with '#' opens a new section; the section name is the
      rest of that line with surrounding whitespace removed.
    * Any other line longer than two characters (newline included) is
      stripped, split on tab characters and stored as a tuple under the
      current section.
    * Lines that appear before the first section header, or under a header
      with an empty name, are ignored.

    Args:
        filename: Path of the known-values file to read.

    Returns:
        A dict mapping each section name to the list of tuples read from
        that section, in file order. A repeated section name restarts its
        list.
    """
    known_values = {}
    current_type = None

    with open(filename) as f:
        for line in f:
            if line.startswith('#'):
                # Drop only the '#' itself and let strip() deal with any
                # following whitespace, so a header written as "#Name" does
                # not lose its first letter (the old line[2:] slice assumed
                # exactly one separator character after the '#').
                current_type = line[1:].strip()
                known_values[current_type] = []
            elif len(line) > 2 and current_type:
                fields = line.strip().split('\t')
                known_values[current_type].append(tuple(fields))

    return known_values
|
||
def runTests(filename="unit_tests_known_values.txt"):
    """Run pinyinizer.addToneMarks against every known input/output pair.

    Loads the sections of known values with getKnownValues(), feeds each
    input to pinyinizer.addToneMarks and compares the result with the
    expected output. For every section that has at least one mismatch, a
    summary line and the individual failures are printed; finally the total
    number of passing cases is printed.

    Args:
        filename: Path of the known-values file. Defaults to the file name
            the suite has always used, resolved against the current working
            directory.
    """
    known_values = getKnownValues(filename)

    passed = 0
    # items() and the single-argument print(...) form behave the same on
    # Python 2 and Python 3, unlike iteritems() and the print statement.
    for test_type, test_set in known_values.items():
        failures = []

        for (test_in, test_out) in test_set:
            result = pinyinizer.addToneMarks(test_in)
            if result != test_out:
                failures.append(" %s -> %s (expecting: %s)" % (test_in, result, test_out))
            else:
                passed += 1

        if failures:
            print(' In "%s", %d of %d failed:' % (test_type, len(failures), len(test_set)))
            for failure in failures:
                print(failure)

    print("Passed %d tests" % passed)
|
||
# Script entry point: run the known-value suite only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    runTests()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
# Words that should not change | ||
ba ba | ||
pa6 pa6 | ||
zi zi | ||
zha zha | ||
web2 web2 | ||
negation1 negation1 | ||
|
||
# One initial + a | ||
ba0 ba | ||
pa1 pā | ||
ma2 má | ||
fa3 fǎ | ||
da4 dà | ||
ta5 ta | ||
na0 na | ||
la1 lā | ||
za2 zá | ||
ca3 cǎ | ||
sa4 sà | ||
ga5 ga | ||
ka0 ka | ||
ha1 hā | ||
ba2 bá | ||
pa3 pǎ | ||
ma4 mà | ||
fa5 fa | ||
|
||
# One initial + o | ||
bo0 bo | ||
po1 pō | ||
mo2 mó | ||
fo3 fǒ | ||
bo4 bò | ||
|
||
# One initial + e | ||
me0 me | ||
de5 de | ||
te1 tē | ||
ne2 né | ||
le3 lě | ||
ze4 zè | ||
ce1 cē | ||
se2 sé | ||
re3 rě | ||
|
||
# One initial + i | ||
zi0 zi | ||
ci1 cī | ||
si2 sí | ||
ji3 jǐ | ||
qi4 qì | ||
xi5 xi | ||
ri1 rī | ||
bi2 bí | ||
pi3 pǐ | ||
mi4 mì | ||
di5 di | ||
ti0 ti | ||
ni1 nī | ||
li2 lí | ||
|
||
# One initial + v | ||
lv lü | ||
nv0 nü | ||
lv1 lǖ | ||
nv2 nǘ | ||
lv3 lǚ | ||
nv4 nǜ | ||
lv5 lü | ||
|
||
# Two initials + vowel | ||
zha0 zha | ||
che1 chē | ||
shi2 shí | ||
zhu3 zhǔ | ||
|
||
# Initial + an | ||
an0 an | ||
an1 ān | ||
ban2 bán | ||
dan3 dǎn | ||
ran4 ràn | ||
gan5 gan | ||
zhan0 zhan | ||
chan1 chān | ||
shan2 shán | ||
|
||
# Initial + en | ||
en0 en | ||
en2 én | ||
pen1 pēn | ||
den3 děn | ||
nen4 nèn | ||
cen5 cen | ||
shen1 shēn | ||
hen3 hěn | ||
|
||
# Initial + ang | ||
ang0 ang | ||
ang2 áng | ||
pang1 pāng | ||
tang3 tǎng | ||
cang4 càng | ||
kang5 kang | ||
zhang1 zhāng | ||
chang0 chang | ||
shang2 sháng | ||
|
||
# Initial + eng | ||
meng0 meng | ||
leng1 lēng | ||
zeng2 zéng | ||
sheng3 shěng | ||
reng4 rèng | ||
geng5 geng | ||
|
||
# Initial + ong | ||
dong0 dong | ||
cong1 cōng | ||
zong2 zóng | ||
zhong3 zhǒng | ||
rong4 ròng | ||
kong5 kong | ||
|
||
# Initial + in | ||
yin0 yin | ||
bin1 bīn | ||
pin2 pín | ||
min3 mǐn | ||
nin4 nìn | ||
lin5 lin | ||
jin1 jīn | ||
qin2 qín | ||
xin3 xǐn | ||
|
||
# Initial + ian | ||
bian0 bian | ||
pian1 piān | ||
mian2 mián | ||
dian3 diǎn | ||
tian4 tiàn | ||
nian5 nian | ||
lian1 liān | ||
jian2 jián | ||
qian3 qiǎn | ||
xian4 xiàn | ||
|
||
# Initial + uan | ||
zhuan0 zhuan | ||
chuan1 chuān | ||
shuan2 shuán | ||
juan3 juǎn | ||
quan4 quàn | ||
xuan5 xuan | ||
|
||
# With r | ||
er0 er | ||
er2 ér | ||
dianr3 diǎnr | ||
shir4 shìr | ||
huar1 huār | ||
nar2 nár | ||
|
||
# Compound words | ||
ni3hao3 nǐhǎo | ||
pin1yin1 pīnyīn | ||
|
||
# Sentences | ||
wo3 ai4 ni3 wǒ ài nǐ |