Skip to content

Commit

Permalink
support python3 now
Browse files Browse the repository at this point in the history
  • Loading branch information
mawenbao committed Apr 9, 2015
1 parent 81f8993 commit e37be34
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 8 deletions.
16 changes: 11 additions & 5 deletions extract-sougou-dict.py
Expand Up @@ -15,6 +15,12 @@

import argparse
import struct
import codecs

# check python version
import sys
if sys.version_info.major == 2:
range = xrange

gWordsOffset = 0x2628 # 词组列表的偏移地址

Expand All @@ -38,7 +44,7 @@ def extract_sougou_words(data):
numTongYinCi, pinYinTableLen = struct.unpack('<HH', data[offset:offset+4])
offset += (4 + pinYinTableLen)

for i in xrange(numTongYinCi):
for i in range(numTongYinCi):
wordLen = struct.unpack('<H', data[offset:offset+2])[0]
offset += 2
word = struct.unpack('<' + str(wordLen) + 's', data[offset:offset+wordLen])[0]
Expand Down Expand Up @@ -66,10 +72,10 @@ def extract_sougou_dict_files(pathList):
args = argParser.parse_args()

wordSet = extract_sougou_dict_files(args.dictfile)
with open(args.output, "w") as f:
with codecs.open(args.output, 'w', encoding='UTF-8') as f:
if not args.mmseg:
f.write('\n'.join(wordSet))
f.write(u'\n'.join(wordSet))
else:
for word in wordSet:
f.write('{}\t1\nx:1\n'.format(word.encode('UTF-8')))
print('成功从{}个搜狗词库中提取出{}个词组 => {}'.format(len(args.dictfile), len(wordSet), args.output))
f.write(u'{}\t1\nx:1\n'.format(word))
print(u'成功从{}个搜狗词库中提取出{}个词组 => {}'.format(len(args.dictfile), len(wordSet), args.output))
12 changes: 9 additions & 3 deletions merge-mmseg-dict.py
Expand Up @@ -8,11 +8,17 @@
'''

import argparse
from itertools import imap, ifilter

# check python version
import sys
if sys.version_info.major == 2:
import itertools
map = itertools.imap
filter = itertools.ifilter

def parse_mmseg_dict(dictPath):
    '''Read an mmseg dictionary file and return its entry lines as a set.

    Every line is stripped of surrounding whitespace. Lines whose stripped
    text begins with 'x' (such as "x:1") are dropped, and so are blank or
    whitespace-only lines, which previously raised IndexError when the
    first character of an empty string was inspected.

    Args:
        dictPath: path of the dictionary file to read.

    Returns:
        A set holding the remaining stripped lines.
    '''
    with open(dictPath, 'r') as f:
        # Strip once per line and reuse the result for both the test and
        # the stored value.
        stripped = (line.strip() for line in f)
        # NOTE: as in the original filter, any entry whose first character
        # is 'x' is discarded, not only "x:1" lines.
        return {entry for entry in stripped
                if entry and not entry.startswith('x')}

if __name__ == '__main__':
argParser = argparse.ArgumentParser(description=u'合并libmmseg中文字典文件,不检查词典文件格式。')
Expand All @@ -36,4 +42,4 @@ def parse_mmseg_dict(dictPath):
print(u'成功合并2个词典文件 {}({}) + {}({}) => {}({})'.format(
args.mainDict, mainWordSetLen, args.secondDict, secondWordSetLen, args.output, numMergedWords))
if (0 != numOmittedWords):
print('{}中的{}个重复词组被忽略'.format(args.secondDict, numOmittedWords))
print(u'{}中的{}个重复词组被忽略'.format(args.secondDict, numOmittedWords))
Binary file added test-data/fishing.scel
Binary file not shown.

0 comments on commit e37be34

Please sign in to comment.