Skip to content

Commit

Permalink
update tools
Browse files Browse the repository at this point in the history
  • Loading branch information
mozillazg committed Dec 7, 2014
1 parent 6f14388 commit 308e7fd
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 17 deletions.
2 changes: 1 addition & 1 deletion pypinyin/phrases_dict.py
Expand Up @@ -1599,7 +1599,7 @@
"朝廷": [["cháo"], ["tíng"]],
"朝夕": [["zhāo"], ["xī"]],
"朝霞": [["zhāo"], ["xiá"]],
"朝阳": [["zhāo"], ["yáng"]],
"朝阳": [["zhāo", "cháo"], ["yáng"]],
"朝向": [["cháo"], ["xiàng"]],
"朝野": [["cháo"], ["yě"]],
"朝政": [["cháo"], ["zhèng"]],
Expand Down
51 changes: 35 additions & 16 deletions tools/get_words_from_zdic_by_unicode.py
Expand Up @@ -11,6 +11,8 @@
from bs4 import BeautifulSoup
import requests

from pypinyin.pinyin_dict import pinyin_dict


class Message(object):
def __init__(self, file_name):
Expand Down Expand Up @@ -51,28 +53,42 @@ def parse_pinyin(html):
word_html = soup.find(id='ziip').text.encode('raw_unicode_escape').decode('utf8')
words = re.findall(ur'“([^”]+)”', word_html)
word = words[0] if words else ''

try:
pinyins = [x.text for x in soup.select('td.z_i_t2_py')[0].select('a')]
pinyins = [x.encode('raw_unicode_escape').decode('utf8') for x in pinyins]
except:
pinyins = []
except Exception as e:
e.word = word
raise
return word, pinyins


def get_word(n, url_base, headers, cookies):
    """Fetch and parse the entry for a single code point.

    ``n`` is the integer code point; its lowercase hex form is substituted
    into ``url_base`` to build the first URL requested. That page is handed
    to ``parse_word_url``, which yields the code point label and a second
    URL; the second page is handed to ``parse_pinyin``.

    Returns a ``(unicode_num, word, pinyins)`` tuple. If any request or
    parsing step raises, the exception is printed and a fallback tuple of
    ``(uppercase hex of n, e.word if present else '', [])`` is returned.
    """
    code_hex = '{0:x}'.format(n)
    lookup_url = url_base % code_hex
    print(hex(n))
    try:
        first_page = request(lookup_url, headers, cookies)
        unicode_num, detail_url = parse_word_url(first_page)
        detail_page = request(detail_url, headers, cookies)
        word, pinyins = parse_pinyin(detail_page)
    except Exception as e:
        print(e)
        # An exception may carry a ``word`` attribute with the character
        # that was already extracted; keep it so the entry is still labelled.
        return code_hex.upper(), getattr(e, 'word', ''), []
    return unicode_num, word, pinyins


def get_words(unicode_range, url_base, headers, cookies):
m = 0
for n in xrange(int(unicode_range[0], 16), int(unicode_range[1], 16) + 1):
url = url_base % '{0:x}'.format(n)
print n,
html = request(url, headers, cookies)
try:
unicode_num, url = parse_word_url(html)
html = request(url, headers, cookies)
word, pinyins = parse_pinyin(html)
print unicode_num, repr(word), pinyins
yield unicode_num, word, pinyins
except Exception as e:
print e
yield '{0:x}'.format(n).upper(), '', []
if n in pinyin_dict:
continue
if m > 900:
m = 0
sleep(120)
m += 1

yield get_word(n, url_base, headers, cookies)
sleep(1)


Expand All @@ -86,6 +102,7 @@ def main():
('4E00', '9FFF'), # CJK 基本:[4E00-9FFF]
('F900', 'FAFF'), # CJK 兼容:[F900-FAFF]
('20000', '2A6DF'), # CJK 扩展 B:[20000-2A6DF]
('20970', '2A6DF'), # CJK 扩展 B:[20000-2A6DF]
('2A700', '2B73F'), # CJK 扩展 C:[2A700-2B73F]
('2B740', '2B81D'), # CJK 扩展 D:[2B740-2B81D]
('2F800', '2FA1F'), # CJK 兼容扩展:[2F800-2FA1F]
Expand Down Expand Up @@ -116,8 +133,10 @@ def main():
','.join(pinyins),
word))
else:
f.write(u"# 0x{0}: '{1}', # {2}\n".format(unicode_num,
'', word))
if word:
word = ' ' + word
f.write(u"# 0x{0}: '{1}', #{2}\n".format(unicode_num,
'', word))


if __name__ == '__main__':
Expand Down

0 comments on commit 308e7fd

Please sign in to comment.