#!/usr/bin/env python
# Natural Language Toolkit: Interface to the Stanford Segmenter
# for Chinese and Arabic
#
# Copyright (C) 2001-2021 NLTK Project
# Author: 52nlp <52nlpcn@gmail.com>
# Casper Lehmann-Strøm <casperlehmann@gmail.com>
# Alex Constantin <alex@keyworder.ch>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

import json
import os
import tempfile
import warnings
from subprocess import PIPE
from nltk.internals import (
_java_options,
config_java,
find_dir,
find_file,
find_jar,
java,
)
from nltk.tokenize.api import TokenizerI

_stanford_url = "https://nlp.stanford.edu/software"


class StanfordSegmenter(TokenizerI):
"""Interface to the Stanford Segmenter
    If the stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
    should be provided, for example::
seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')
>>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
>>> seg = StanfordSegmenter()
>>> seg.default_config('zh')
>>> sent = u'这是斯坦福中文分词器测试'
>>> print(seg.segment(sent))
\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5
<BLANKLINE>
>>> seg.default_config('ar')
>>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
>>> print(seg.segment(sent.split()))
\u0647\u0630\u0627 \u0647\u0648 \u062a\u0635\u0646\u064a\u0641 \u0633\u062a\u0627\u0646\u0641\u0648\u0631\u062f \u0627\u0644\u0639\u0631\u0628\u064a \u0644 \u0627\u0644\u0643\u0644\u0645\u0627\u062a
<BLANKLINE>
"""

    _JAR = "stanford-segmenter.jar"

def __init__(
self,
path_to_jar=None,
path_to_slf4j=None,
java_class=None,
path_to_model=None,
path_to_dict=None,
path_to_sihan_corpora_dict=None,
sihan_post_processing="false",
keep_whitespaces="false",
encoding="UTF-8",
options=None,
verbose=False,
java_options="-mx2g",
):
# Raise deprecation warning.
warnings.simplefilter("always", DeprecationWarning)
        warnings.warn(
            "\nThe StanfordSegmenter will "
            "be deprecated in version 3.2.5.\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.",
            DeprecationWarning,
            stacklevel=2,
        )
warnings.simplefilter("ignore", DeprecationWarning)
stanford_segmenter = find_jar(
self._JAR,
path_to_jar,
env_vars=("STANFORD_SEGMENTER",),
searchpath=(),
url=_stanford_url,
verbose=verbose,
)
if path_to_slf4j is not None:
slf4j = find_jar(
"slf4j-api.jar",
path_to_slf4j,
env_vars=("SLF4J", "STANFORD_SEGMENTER"),
searchpath=(),
url=_stanford_url,
verbose=verbose,
)
else:
slf4j = None
# This is passed to java as the -cp option, the old version of segmenter needs slf4j.
# The new version of stanford-segmenter-2016-10-31 doesn't need slf4j
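        # The jar paths are joined with os.pathsep (':' on POSIX, ';' on Windows).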
self._stanford_jar = os.pathsep.join(
_ for _ in [stanford_segmenter, slf4j] if _ is not None
)
self._java_class = java_class
self._model = path_to_model
self._sihan_corpora_dict = path_to_sihan_corpora_dict
self._sihan_post_processing = sihan_post_processing
self._keep_whitespaces = keep_whitespaces
self._dict = path_to_dict
self._encoding = encoding
self.java_options = java_options
options = {} if options is None else options
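        # Render any extra options as a comma-separated string for java's
        # -options flag, e.g. {"someFlag": True} becomes 'someFlag=true'
        # (the flag name here is purely illustrative).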
self._options_cmd = ",".join(
f"{key}={json.dumps(val)}" for key, val in options.items()
)

    def default_config(self, lang):
"""
Attempt to initialize Stanford Word Segmenter for the specified language
using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables
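
        Only ``'ar'`` (Arabic) and ``'zh'`` (Chinese) are supported; any
        other value raises ``LookupError``::

            seg = StanfordSegmenter()
            seg.default_config('zh')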
"""
search_path = ()
if os.environ.get("STANFORD_SEGMENTER"):
            search_path = (os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data"),)
# init for Chinese-specific files
self._dict = None
self._sihan_corpora_dict = None
self._sihan_post_processing = "false"
if lang == "ar":
self._java_class = (
"edu.stanford.nlp.international.arabic.process.ArabicSegmenter"
)
model = "arabic-segmenter-atb+bn+arztrain.ser.gz"
elif lang == "zh":
self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier"
model = "pku.gz"
self._sihan_post_processing = "true"
path_to_dict = "dict-chris6.ser.gz"
try:
self._dict = find_file(
path_to_dict,
searchpath=search_path,
url=_stanford_url,
verbose=False,
env_vars=("STANFORD_MODELS",),
)
except LookupError as e:
raise LookupError(
"Could not find '%s' (tried using env. "
"variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
% path_to_dict
) from e
sihan_dir = "./data/"
try:
path_to_sihan_dir = find_dir(
sihan_dir,
url=_stanford_url,
verbose=False,
env_vars=("STANFORD_SEGMENTER",),
)
self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
except LookupError as e:
raise LookupError(
"Could not find '%s' (tried using the "
"STANFORD_SEGMENTER environment variable)" % sihan_dir
) from e
else:
raise LookupError(f"Unsupported language {lang}")
try:
self._model = find_file(
model,
searchpath=search_path,
url=_stanford_url,
verbose=False,
env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"),
)
except LookupError as e:
raise LookupError(
"Could not find '%s' (tried using env. "
"variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model
) from e

    def tokenize(self, s):
        return super().tokenize(s)

    def segment_file(self, input_file_path):
        """
        Segment the contents of the file at ``input_file_path`` and return
        the segmenter's output as a single string.
        """
cmd = [
self._java_class,
"-loadClassifier",
self._model,
"-keepAllWhitespaces",
self._keep_whitespaces,
"-textFile",
input_file_path,
]
if self._sihan_corpora_dict is not None:
cmd.extend(
[
"-serDictionary",
self._dict,
"-sighanCorporaDict",
self._sihan_corpora_dict,
"-sighanPostProcessing",
self._sihan_post_processing,
]
)
stdout = self._execute(cmd)
return stdout

    def segment(self, tokens):
        return self.segment_sents([tokens])

    def segment_sents(self, sentences):
        """
        Segment a list of tokenized sentences and return the segmenter's
        output as a single string, one segmented sentence per line.
        """
encoding = self._encoding
# Create a temporary input file
_input_fh, self._input_file_path = tempfile.mkstemp(text=True)
        # Write the actual sentences to the temporary input file
_input_fh = os.fdopen(_input_fh, "wb")
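        # One sentence per line, with tokens joined by single spaces.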
_input = "\n".join(" ".join(x) for x in sentences)
if isinstance(_input, str) and encoding:
_input = _input.encode(encoding)
_input_fh.write(_input)
_input_fh.close()
cmd = [
self._java_class,
"-loadClassifier",
self._model,
"-keepAllWhitespaces",
self._keep_whitespaces,
"-textFile",
self._input_file_path,
]
if self._sihan_corpora_dict is not None:
cmd.extend(
[
"-serDictionary",
self._dict,
"-sighanCorporaDict",
self._sihan_corpora_dict,
"-sighanPostProcessing",
self._sihan_post_processing,
]
)
stdout = self._execute(cmd)
# Delete the temporary file
os.unlink(self._input_file_path)
return stdout

    def _execute(self, cmd, verbose=False):
encoding = self._encoding
cmd.extend(["-inputEncoding", encoding])
_options_cmd = self._options_cmd
if _options_cmd:
cmd.extend(["-options", self._options_cmd])
default_options = " ".join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
stdout, _stderr = java(
cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
)
stdout = stdout.decode(encoding)
# Return java configurations to their default values.
config_java(options=default_options, verbose=False)
return stdout
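

# A minimal command-line sketch: it assumes the segmenter jar and a Chinese
# model are discoverable through the STANFORD_SEGMENTER / STANFORD_MODELS
# environment variables, mirroring the doctest in the class docstring.
if __name__ == "__main__":
    seg = StanfordSegmenter()
    seg.default_config("zh")
    # Prints the sentence split into space-separated words, e.g.
    # '这 是 斯坦福 中文 分词器 测试'
    print(seg.segment("这是斯坦福中文分词器测试"))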