Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add script naer_seg and modify default model path
- Loading branch information
Ming-Hong Bai
committed
Mar 18, 2016
1 parent
047c8b7
commit c4e9e98
Showing
9 changed files
with
235 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,6 @@ | ||
*.pyc | ||
*.pyo | ||
Data/ | ||
*.swp | ||
Segmentor/Data/ | ||
build/ | ||
tmp/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,5 @@ | ||
# Segmentor | ||
|
||
下載 Segmentor 的 model: | ||
|
||
<http://120.127.233.228/download/> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
#!/usr/bin/python | ||
#-*- encoding: UTF-8 -*- | ||
# | ||
# NAER Segmentor - | ||
# Copyright (c) 2016 National Academy for Educational Research | ||
# | ||
# This program is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
# | ||
|
||
from optparse import OptionParser | ||
from Segmentor import * | ||
import string | ||
import os,sys | ||
import json | ||
import locale | ||
import re | ||
|
||
def _unescape_option(value):
    """Expand backslash escapes (e.g. a literal ``\\t``) in an option value and return text."""
    if str is bytes and isinstance(value, str):
        # Python 2 byte string taken from argv: keep the original two-step decode,
        # but fall back to UTF-8 when the locale does not report an encoding
        # (getdefaultlocale() returns (None, None) for an unset/"C" locale).
        encoding = locale.getdefaultlocale()[1] or "UTF-8"
        return value.decode("string_escape").decode(encoding)
    # Text value (Python 3 str, or a Python 2 unicode default). Non-Latin-1
    # characters are turned into \uXXXX escapes first so that unicode_escape
    # restores them unchanged while still expanding the user's own escapes.
    return value.encode("latin-1", "backslashreplace").decode("unicode_escape")


def parseArg():
    """Parse and validate the command line of the segmentor script.

    Returns:
        An ``(options, args)`` tuple as produced by ``OptionParser.parse_args()``.
        ``boundary``, ``format``, ``region`` and ``mask`` have their backslash
        escapes expanded and are returned as text.

    Exits through ``parser.error()`` (usage message on stderr, status 2) when
    no input is given, when the output directory or the list file does not
    exist, or when both segmentation and POS tagging are disabled.
    """
    parser = OptionParser(usage='Usage: %prog [options] input_file1[::output_file1] ...')

    parser.add_option('-s', '--suffix', action='store', type='string',
                      dest='suffix', help='suffix for output file', default='.seg')

    parser.add_option('-m', '--model', action='store', type='string',
                      dest='model', help='directory of models for word segmention and POS tagging', default=None)

    parser.add_option('-p', '--postag', action='store_true',
                      dest='postag', help='POS tagging switch', default=False)

    parser.add_option('-n', '--disable-segment', action='store_false',
                      dest='segment', help='segmentation switch', default=True)

    parser.add_option('-b', '--boundary', action='store', type='string',
                      dest='boundary', help='word boundary', default=u" ")

    parser.add_option('-f', '--format', action='store', type='string',
                      dest='format', help='output format for a tagged word', default="%s(%s)")

    parser.add_option('--output-dir', action='store', type='string',
                      dest='outputdir', help='save output files to specific directory', default="")

    parser.add_option('-l', '--list', action='store', type='string',
                      dest='list', help='read input file list from file', default="")

    parser.add_option('-v', '--verbose', action='store_true',
                      dest='verbose', help='enable verbose mode', default=False)

    parser.add_option('-e', '--encoding', action='store', type='string',
                      dest='encoding', help='set input and output encoding', default="UTF-8")

    parser.add_option('--region', action='append', type='string',
                      dest='region', help='set processing region. e.g. --region="<chtitle>::</chtitle>" will segment text between <chtitle> and </chtitle> tags. "::" is the separator of the start and end tags.', default=[])

    parser.add_option('--mask', action='append', type='string',
                      dest='mask', help='set mask region which will not be processed. e.g. --mask="<[^>]+>" will prevent html tags, such as <font size="12">, to be segmented.', default=[])

    parser.add_option('-D', '--directory', action='append', type='string',
                      dest='directory', help='set input (and output) directory. e.g. --directory="dir1::dir2" will process files in dir1 and output to dir2.', default=[])

    parser.add_option('--exclude', action='store', type='string',
                      dest='exclude', help='set exclude regular expression.', default="")

    parser.add_option('--include', action='store', type='string',
                      dest='include', help='set include regular expression.', default="")

    # The help text used to be a copy of the --exclude help.
    parser.add_option('-R', '--recursive', action='store_true',
                      dest='recursive', help='process directories recursively.', default=False)

    parser.add_option('--line', action='store_true',
                      dest='line', help='process data line by line.', default=False)

    (options, args) = parser.parse_args()

    # Let users write escapes such as "\t" or "\n" on the command line.
    options.boundary = _unescape_option(options.boundary)
    options.format = _unescape_option(options.format)
    options.region = [_unescape_option(x) for x in options.region]
    options.mask = [_unescape_option(x) for x in options.mask]

    # At least one input source is required: positional files, --list or --directory.
    if not (args or options.list or options.directory):
        parser.error('input file needed!')

    if options.outputdir and not os.path.isdir(options.outputdir):
        parser.error('directory "%s" not found!' % options.outputdir)

    # "-" means "read the file list from stdin", so it needs no existence check.
    if options.list and options.list != "-" and not os.path.exists(options.list):
        parser.error('list file "%s" not found!' % options.list)

    if (not options.segment) and (not options.postag):
        parser.error('segment and postag cannot be disabled in the same time!')

    return (options, args)
|
||
if __name__=="__main__":
    # Command-line entry point: configure a FileSegmentor from the parsed
    # options, then process positional files, the --list file and every
    # --directory entry, in that order.
    (options, args)=parseArg()

    if options.verbose:
        sys.stderr.write("Loading word segmentation model ... ")

    # model_dir is only passed when -m/--model was given, so that
    # FileSegmentor falls back to its own default otherwise.
    if options.model:
        segmentor=FileSegmentor(model_dir=options.model, segment=options.segment, postag=options.postag)
    else:
        segmentor=FileSegmentor(segment=options.segment, postag=options.postag)

    if options.verbose:
        sys.stderr.write('done.\n')

    segmentor.setBoundary(options.boundary)
    segmentor.setFormat(options.format)

    if len(options.region)>0:
        segmentor.setRegion(region_list=options.region)

    if len(options.mask)>0:
        segmentor.setMask(mask_list=options.mask)

    if options.outputdir:
        segmentor.setOutputDir(options.outputdir)

    if options.suffix:
        segmentor.setSuffix(options.suffix)

    if options.line:
        segmentor.line_mode=True

    # Positional arguments: "-" streams stdin to stdout, anything else is a file name.
    for filename in args:
        if filename=="-":
            segmentor.procFile(sys.stdin, sys.stdout)
        else:
            segmentor.procFile(filename)

    # --list: one input file name per line, read from a file or from stdin ("-").
    if len(options.list)>0:
        list_from_stdin = options.list=="-"
        if list_from_stdin:
            list_f=sys.stdin
        else:
            list_f=open(options.list)

        try:
            for filename in list_f:
                filename=filename.strip(" \r\n")
                if options.verbose:
                    sys.stderr.write('Processing "%s" ... '%(filename))

                if os.path.isfile(filename):
                    segmentor.procFile(filename)
                    if options.verbose:
                        sys.stderr.write('done.\n')
                elif options.verbose:
                    # Reached for every path that is not a regular file.
                    sys.stderr.write('is a directory, skiped.\n')
        finally:
            # Close the list file we opened ourselves; never close stdin.
            if not list_from_stdin:
                list_f.close()

    # --directory: "in_dir" or "in_dir::out_dir"; the filters are the same for every entry.
    if len(options.directory)>0:
        exclude=options.exclude if len(options.exclude)>0 else None
        include=options.include if len(options.include)>0 else None
        for item in options.directory:
            L=item.split('::')
            if len(L)==1:
                segmentor.procDir(item, exclude=exclude, include=include,
                                  recursive=options.recursive, verbose=options.verbose)
            else:
                segmentor.procDir(L[0],L[1], exclude=exclude, include=include,
                                  recursive=options.recursive, verbose=options.verbose)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#-*- coding:utf8 -*- | ||
# | ||
# This file is part of the NAER Segmentor - | ||
# Copyright (c) 2016 National Academy for Educational Research | ||
# | ||
# This program is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
|
||
import sys | ||
from Segmentor import * | ||
|
||
|
||
# The segmentor is created once at import time and shared by every
# getSegResult() call. The progress text has no trailing newline, so it is
# flushed explicitly to make it visible before the model load starts.
sys.stdout.write("Loading NAER Segmentor ... ")
sys.stdout.flush()
segmentor=FileSegmentor(postag=True)
sys.stdout.write(" done.\n")
|
||
def getSegResult(Info):
    """Run the module-level segmentor on ``Info["RawText"]`` and return its result.

    Args:
        Info: mapping that holds the document text under the key "RawText".

    Returns:
        Whatever ``segmentor.procDoc()`` returns for that text.
    """
    return segmentor.procDoc(Info["RawText"])
|
||
if __name__=="__main__":
    # Small demo: segment a fixed sample paragraph and print the result.
    RawText=u'''\
市面上很少有「教科書設計」的專書,因為我們總覺得那是出版社的事!
然而,真的是這樣嗎?
教科書設計其實與課程綱要、教師的教學、學生的學習息息相關,是課程、教學、學習三位一體間一個重要的環節,除了有教育學與學科專業等內容涵納其中,也與編輯、版式等視覺設計元素的概念有關。
有鑒於此議題的重要,本院教科書發展中心邀請淡江大學課程與教學研究所陳麗華所長,於8月27日上午進行「教科書設計研究」專題演講,除了院內同仁,也邀請出版社編輯企劃相關人員參與。'''

    result=getSegResult({"RawText":RawText})
    # Parenthesised form prints the same single value on Python 2 and is
    # also valid syntax on Python 3.
    print(result)