Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 1177 lines (991 sloc) 45.011 kb
a468620 added module for accessing resources in nltk data package
Edward Loper authored
1 # Natural Language Toolkit: Utility functions
2 #
5c523dd @stevenbird Updated NLTK copyright year range from 2001-2010 to 2001-2011
stevenbird authored
3 # Copyright (C) 2001-2011 NLTK Project
a468620 added module for accessing resources in nltk data package
Edward Loper authored
4 # Author: Edward Loper <edloper@gradient.cis.upenn.edu>
0ac13a2 @stevenbird Add www to nltk.org url globally.
stevenbird authored
5 # URL: <http://www.nltk.org/>
a468620 added module for accessing resources in nltk data package
Edward Loper authored
6 # For license information, see LICENSE.TXT
7
8 """
bb4f25f - Added retrieve()
Edward Loper authored
9 Functions to find and load NLTK X{resource files}, such as corpora,
10 grammars, and saved processing objects. Resource files are identified
11 using URLs, such as "C{nltk:corpora/abc/rural.txt}" or
12 "C{http://nltk.org/sample/toy.cfg}". The following URL protocols are
13 supported:
a468620 added module for accessing resources in nltk data package
Edward Loper authored
14
bb4f25f - Added retrieve()
Edward Loper authored
15 - "C{file:I{path}}": Specifies the file whose path is C{I{path}}.
16 Both relative and absolute paths may be used.
17
d57a067 - Fixed a slew of epytext markup errors.
Edward Loper authored
18 - "C{http://I{host}/I{path}}": Specifies the file stored on the web
bb4f25f - Added retrieve()
Edward Loper authored
19 server C{I{host}} at path C{I{path}}.
20
21 - "C{nltk:I{path}}": Specifies the file stored in the NLTK data
22 package at C{I{path}}. NLTK will search for these files in the
23 directories specified by L{nltk.data.path}.
24
25 If no protocol is specified, then the default protocol "C{nltk:}" will
26 be used.
27
28 This module provides two functions that can be used to access a
29 resource file, given its URL: L{load()} loads a given resource, and
30 adds it to a resource cache; and L{retrieve()} copies a given resource
31 to a local file.
a468620 added module for accessing resources in nltk data package
Edward Loper authored
32 """
33
a495f7b @stevenbird tweaked import statements for conformance with PEP-8
stevenbird authored
34 import sys
35 import os, os.path
36 import textwrap
37 import weakref
38 import re
4d0c9b9 - Changed to use urllib2 rather than urllib. urllib2 responds in a
Edward Loper authored
39 import urllib2
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
40 import zipfile
41 import codecs
c6f9d85 @stevenbird Applied patch from Joseph Frazee to improve performance for accessing…
stevenbird authored
42
43 from gzip import GzipFile, READ as GZ_READ, WRITE as GZ_WRITE
3515b1c @jfrazee Added GzipFileSystemPathPointer and related modifications for loading
jfrazee authored
44
45 try:
3196f72 @stevenbird Added conditional import of Z_SYNC_FLUSH / Z_FINISH. Resolves issue …
stevenbird authored
46 from zlib import Z_SYNC_FLUSH as FLUSH
47 except:
48 from zlib import Z_FINISH as FLUSH
49
50 try:
3515b1c @jfrazee Added GzipFileSystemPathPointer and related modifications for loading
jfrazee authored
51 import cPickle as pickle
97e5eaf @jfrazee Fixed mistake in imports for cPickle, cStringIO. LoadError doesn't e…
jfrazee authored
52 except:
3515b1c @jfrazee Added GzipFileSystemPathPointer and related modifications for loading
jfrazee authored
53 import pickle
54
55 try:
56 from cStringIO import StringIO
97e5eaf @jfrazee Fixed mistake in imports for cPickle, cStringIO. LoadError doesn't e…
jfrazee authored
57 except:
3515b1c @jfrazee Added GzipFileSystemPathPointer and related modifications for loading
jfrazee authored
58 from StringIO import StringIO
a495f7b @stevenbird tweaked import statements for conformance with PEP-8
stevenbird authored
59
a015b5d Modified nltk-internal imports to import values from the modules wher…
Edward Loper authored
60 import nltk
a468620 added module for accessing resources in nltk data package
Edward Loper authored
61
62 ######################################################################
63 # Search Path
64 ######################################################################
65
path = []
"""The ordered list of directories in which the NLTK data package may
live. Entries are consulted front to back when a resource is looked
up, so users can shadow a stock resource with their own copy simply
by placing it in an earlier directory (e.g., ~/nltk_data in their
home directory)."""

# User-specified locations: every non-empty entry of $NLTK_DATA ...
path.extend(filter(None, os.environ.get('NLTK_DATA', '').split(os.pathsep)))

# ... followed by ~/nltk_data, but only when '~' can actually be expanded.
if os.path.expanduser('~/') != '~/':
    path.append(os.path.expanduser('~/nltk_data'))

if sys.platform.startswith('win'):
    # Common locations on Windows.  APPDATA may be undefined, in which
    # case the drive root is used instead.
    path.extend([
        r'C:\nltk_data',
        r'D:\nltk_data',
        r'E:\nltk_data',
        os.path.join(sys.prefix, 'nltk_data'),
        os.path.join(sys.prefix, 'lib', 'nltk_data'),
        os.path.join(os.environ.get('APPDATA', 'C:\\'), 'nltk_data'),
    ])
else:
    # Common locations on UNIX & OS X.
    path.extend([
        '/usr/share/nltk_data',
        '/usr/local/share/nltk_data',
        '/usr/lib/nltk_data',
        '/usr/local/lib/nltk_data',
    ])
a468620 added module for accessing resources in nltk data package
Edward Loper authored
91
92 ######################################################################
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
93 # Path Pointers
94 ######################################################################
95
class PathPointer(object):
    """
    Abstract interface for 'path pointers': objects that NLTK's data
    package uses to name one specific file or directory.  Concrete
    implementations are L{FileSystemPathPointer} (a file reachable
    directly through an absolute path) and L{ZipFilePathPointer} (a
    member of a zipfile, reachable by reading that zipfile).

    Every method here raises C{NotImplementedError}; subclasses are
    expected to override all of them.
    """

    def open(self, encoding=None):
        """
        Open the file this pointer identifies and return a seekable,
        read-only stream over its contents.

        :param encoding: optional encoding name; how it is applied is
            up to the concrete subclass.
        :raise IOError: If the path specified by this pointer does
            not contain a readable file.
        """
        raise NotImplementedError('abstract base class')

    def file_size(self):
        """
        Return the size, in bytes, of the file this pointer identifies.

        :raise IOError: If the path specified by this pointer does
            not contain a readable file.
        """
        raise NotImplementedError('abstract base class')

    def join(self, fileid):
        """
        Build a new path pointer by starting from this pointer's path
        and following the relative path C{fileid}.  Components of
        C{fileid} are always separated by forward slashes (C{/}), no
        matter which separator the underlying file system uses.
        """
        raise NotImplementedError('abstract base class')
134
3515b1c @jfrazee Added GzipFileSystemPathPointer and related modifications for loading
jfrazee authored
135
a610749 @stevenbird Reversed r8276, resolving issue 424.
stevenbird authored
class FileSystemPathPointer(PathPointer, str):
    """
    A path pointer for a file that is reachable directly through an
    absolute file system path.

    The class also derives from C{str} purely for backwards
    compatibility: older code that expects C{nltk.data.find()} to
    hand back a plain string will usually keep working (as long as
    the resource was not found inside a zipfile), and the built-in
    open() accepts a C{FileSystemPathPointer} as well.
    """

    def __init__(self, path):
        """
        Create a new path pointer for the given path, which is made
        absolute before being stored.

        :raise IOError: If the given path does not exist.
        """
        absolute = os.path.abspath(path)
        if not os.path.exists(absolute):
            raise IOError('No such file or directory: %r' % absolute)
        # str.__init__() is intentionally not called: it is a no-op,
        # because str does all of its setup work in __new__.
        self._path = absolute

    @property
    def path(self):
        """The absolute path identified by this path pointer."""
        return self._path

    def open(self, encoding=None):
        """
        Open the file in binary mode.  When C{encoding} is given, the
        raw stream is wrapped in a C{SeekableUnicodeStreamReader}.
        """
        raw = open(self._path, 'rb')
        if encoding is None:
            return raw
        return SeekableUnicodeStreamReader(raw, encoding)

    def file_size(self):
        """Return the size of the file in bytes, as reported by stat."""
        return os.path.getsize(self._path)

    def join(self, fileid):
        """
        Return a pointer for C{fileid} (a '/'-separated relative path)
        resolved against this pointer's path.
        """
        components = fileid.split('/')
        return FileSystemPathPointer(os.path.join(self._path, *components))

    def __repr__(self):
        return 'FileSystemPathPointer(%r)' % self._path

    def __str__(self):
        return self._path
181
3515b1c @jfrazee Added GzipFileSystemPathPointer and related modifications for loading
jfrazee authored
182
c6f9d85 @stevenbird Applied patch from Joseph Frazee to improve performance for accessing…
stevenbird authored
class BufferedGzipFile(GzipFile):
    """
    A C{GzipFile} subclass that buffers calls to L{read()} and L{write()}.
    This allows faster reads and writes of data to and from gzip-compressed
    files at the cost of using more memory.

    The default buffer size is 2mb.

    C{BufferedGzipFile} is useful for loading large gzipped pickle objects
    as well as writing large encoded feature files for classifier training.
    """
    SIZE = 2 * 2**20

    def __init__(self, filename=None, mode=None, compresslevel=9,
                 fileobj=None, **kwargs):
        """
        :return: a buffered gzip file object
        :rtype: C{BufferedGzipFile}
        :param filename: a filesystem path
        :type filename: str
        :param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab',
            'w', or 'wb'
        :type mode: str
        :param compresslevel: The compresslevel argument is an integer from 1
            to 9 controlling the level of compression; 1 is fastest and
            produces the least compression, and 9 is slowest and produces the
            most compression. The default is 9.
        :type compresslevel: int
        :param fileobj: a StringIO stream to read from instead of a file.
        :type fileobj: C{StringIO}
        :param size: number of bytes to buffer during calls to
            L{read()} and L{write()}
        :type size: int
        """
        GzipFile.__init__(self, filename, mode, compresslevel, fileobj)
        self._size = kwargs.get('size', self.SIZE)
        # The pending-write buffer is deliberately NOT called `_buffer`:
        # newer versions of the standard library's GzipFile keep their
        # own internal `_buffer` attribute, and overwriting it breaks
        # the base class's read()/write().
        self._nltk_buffer = StringIO()
        # cStringIO does not support len, so track the size by hand.
        self._len = 0

    def _reset_buffer(self):
        # For some reason calling StringIO.truncate() here will lead to
        # inconsistent writes so just replace the buffer with a new
        # StringIO object.
        self._nltk_buffer = StringIO()
        self._len = 0

    def _write_buffer(self, data):
        # Simply write to the buffer and increment the buffer size.
        if data is not None:
            self._nltk_buffer.write(data)
            self._len += len(data)

    def _write_gzip(self, data):
        # Write the current buffer to the GzipFile.
        GzipFile.write(self, self._nltk_buffer.getvalue())
        # Then reset the buffer and write the new data to the buffer.
        self._reset_buffer()
        self._write_buffer(data)

    def _drain(self):
        # Push any buffered bytes into the gzip stream.  Skipping the
        # write when nothing is pending keeps close()/flush() safe to
        # call on a file that has already been closed.
        if self.mode == GZ_WRITE and self._len:
            self._write_gzip(None)

    def close(self):
        """
        Write out any buffered data, then close the underlying
        C{GzipFile}.  Calling close() more than once is harmless.
        """
        self._drain()
        self._reset_buffer()
        return GzipFile.close(self)

    def flush(self, lib_mode=FLUSH):
        """
        Push buffered data into the gzip stream and flush it.

        :param lib_mode: the zlib flush mode handed to C{GzipFile.flush}.
        """
        # Data still sitting in the local buffer has not reached the
        # compressor yet, so it must be written before flushing.
        self._drain()
        GzipFile.flush(self, lib_mode)

    def read(self, size=None):
        """
        Read from the file.

        :param size: number of bytes to read.  If false (C{None} or 0),
            the whole remaining file is read, in chunks of the
            configured buffer size.
        :type size: int
        :return: the decompressed data
        """
        if size:
            return GzipFile.read(self, size)
        contents = StringIO()
        while True:
            blocks = GzipFile.read(self, self._size)
            if not blocks:
                contents.flush()
                break
            contents.write(blocks)
        return contents.getvalue()

    def write(self, data, size=-1):
        """
        :param data: str to write to file or buffer
        :type data: str
        :param size: buffer at least size bytes before writing to file.
            A false or negative value (the default) selects the buffer
            size this object was created with.
        :type size: int
        """
        # The default of -1 is truthy, so a bare `if not size` test never
        # selected the configured buffer size and every call bypassed
        # the buffer; negative values are treated as "use the default".
        if not size or size < 0:
            size = self._size
        if self._len + len(data) <= size:
            self._write_buffer(data)
        else:
            self._write_gzip(data)
281
3515b1c @jfrazee Added GzipFileSystemPathPointer and related modifications for loading
jfrazee authored
class GzipFileSystemPathPointer(FileSystemPathPointer):
    """
    A C{FileSystemPathPointer} whose target is a gzip-compressed file
    at a given absolute path.  It is intended for efficiently loading
    large gzip-compressed pickle objects.
    """

    def open(self, encoding=None):
        """
        Open the file through a C{BufferedGzipFile} in 'rb' mode.  If
        a (non-empty) C{encoding} is supplied, the decompressed stream
        is wrapped in a C{SeekableUnicodeStreamReader}.
        """
        gz_stream = BufferedGzipFile(self._path, 'rb')
        if not encoding:
            return gz_stream
        return SeekableUnicodeStreamReader(gz_stream, encoding)
293
294
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
class ZipFilePathPointer(PathPointer):
    """
    A path pointer that identifies a file contained within a zipfile,
    which can be accessed by reading that zipfile.
    """

    def __init__(self, zipfile, entry=''):
        """
        Create a new path pointer pointing at the specified entry
        in the given zipfile.

        :param zipfile: either the path of a zipfile (a string) or an
            already-constructed zipfile object.
        :param entry: the '/'-separated name of the member inside the
            zipfile; the empty string refers to the zipfile's root.
        :raise IOError: If the given zipfile does not exist, or if it
            does not contain the specified entry.
        """
        if isinstance(zipfile, basestring):
            zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile))

        # Normalize the entry string: drop a leading slash and collapse
        # runs of slashes.
        entry = re.sub('(^|/)/+', r'\1', entry)

        # Check that the entry exists:
        if entry:
            try:
                zipfile.getinfo(entry)
            except Exception:
                # (Not a bare `except:`, so that KeyboardInterrupt and
                # SystemExit are not swallowed.)
                # Sometimes directories aren't explicitly listed in
                # the zip file.  So if `entry` is a directory name,
                # then check if the zipfile contains any files that
                # are under the given directory.
                implicit_dir = entry.endswith('/') and any(
                    name.startswith(entry) for name in zipfile.namelist())
                if not implicit_dir:
                    # Otherwise, complain.
                    raise IOError('Zipfile %r does not contain %r' %
                                  (zipfile.filename, entry))
        self._zipfile = zipfile
        self._entry = entry

    @property
    def zipfile(self):
        """The C{zipfile.ZipFile} object used to access the zip file
        containing the entry identified by this path pointer."""
        return self._zipfile

    @property
    def entry(self):
        """The name of the file within C{zipfile} that this path
        pointer points to."""
        return self._entry

    def open(self, encoding=None):
        """
        Read the whole entry into memory and return a stream over it.
        Entries whose name ends in '.gz' are wrapped in a
        C{BufferedGzipFile} (C{encoding} is not applied to them);
        otherwise, if C{encoding} is given, the stream is wrapped in a
        C{SeekableUnicodeStreamReader}.
        """
        data = self._zipfile.read(self._entry)
        stream = StringIO(data)
        if self._entry.endswith('.gz'):
            stream = BufferedGzipFile(self._entry, fileobj=stream)
        elif encoding is not None:
            stream = SeekableUnicodeStreamReader(stream, encoding)
        return stream

    def file_size(self):
        """Return the uncompressed size of the entry, in bytes."""
        return self._zipfile.getinfo(self._entry).file_size

    def join(self, fileid):
        """
        Return a pointer for C{fileid} (a '/'-separated relative path)
        resolved against this pointer's entry, inside the same zipfile.
        """
        entry = '%s/%s' % (self._entry, fileid)
        return ZipFilePathPointer(self._zipfile, entry)

    def __repr__(self):
        return 'ZipFilePathPointer(%r, %r)' % (
            self._zipfile.filename, self._entry)
c6f9d85 @stevenbird Applied patch from Joseph Frazee to improve performance for accessing…
stevenbird authored
358
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
359 ######################################################################
a468620 added module for accessing resources in nltk data package
Edward Loper authored
360 # Access Functions
361 ######################################################################
362
1fcfaca Use a normal dictionary for caching nltk.data.load() rather than a we…
Edward Loper authored
# A plain dictionary (strong references) is used on purpose: with a
# weak dictionary, the common case causes a lot more reloading than
# necessary.
_resource_cache = dict()
"""Cache of already-loaded resources, so that a resource does not
have to be loaded more than once."""
368
bb4f25f - Added retrieve()
Edward Loper authored
def find(resource_name):
    """
    Locate a resource by walking the directories and zip files listed
    in L{nltk.data.path}, and return a path pointer for the first
    match.  If nothing matches, a C{LookupError} is raised whose
    message points the user at the NLTK downloader.

    Zip File Handling:

      - A component of C{resource_name} that has a C{.zip} extension
        is taken to be a zipfile; the components after it are looked
        up inside that zipfile.

      - Any element of C{nltk.data.path} that has a C{.zip} extension
        (and is a file) is taken to be a zipfile.

      - If a resource name without any zipfile component is not found
        on the first pass, C{find()} retries with each component I{p}
        in turn replaced by I{p.zip/p}.  This is what lets the name
        C{corpora/chat80/cities.pl} resolve to a zip file path pointer
        for C{corpora/chat80.zip/chat80/cities.pl}.

      - To locate a directory that lives inside a zipfile, the
        resource name I{must} end with the C{'/'} character;
        otherwise C{find()} will not locate the directory.

    :type resource_name: str
    :param resource_name: The name of the resource to search for.
        Resource names are posix-style relative path names, such as
        C{'corpora/brown'}.  Directory names are always separated by
        C{'/'}, which is converted to the platform's own separator
        automatically.
    :rtype: str
    :raise LookupError: If the resource cannot be found.
    """
    # Split off a leading "<...>.zip" component, if there is one.  The
    # trailing '|' alternative lets the pattern always match, leaving
    # both groups as None when the name mentions no zipfile.
    match = re.match(r'(.*\.zip)/?(.*)$|', resource_name)
    zip_name, zip_entry = match.groups()

    # Check each item in our path
    for location in path:

        # A search-path entry that is itself a zipfile.
        if os.path.isfile(location) and location.endswith('.zip'):
            try:
                return ZipFilePathPointer(location, resource_name)
            except IOError:
                continue  # resource not in zipfile

        # Anything that is not a directory cannot hold the resource.
        if not os.path.isdir(location):
            continue

        if zip_name is None:
            candidate = os.path.join(location, *resource_name.split('/'))
            if os.path.exists(candidate):
                if candidate.endswith('.gz'):
                    return GzipFileSystemPathPointer(candidate)
                return FileSystemPathPointer(candidate)
        else:
            candidate = os.path.join(location, *zip_name.split('/'))
            if os.path.exists(candidate):
                try:
                    return ZipFilePathPointer(candidate, zip_entry)
                except IOError:
                    continue  # resource not in zipfile

    # Fallback: if the name doesn't mention a zipfile, try again under
    # the assumption that one of its components lives inside a zipfile
    # of the same name.
    if zip_name is None:
        parts = resource_name.split('/')
        for idx, part in enumerate(parts):
            zipped_name = '/'.join(parts[:idx] + [part + '.zip'] + parts[idx:])
            try:
                return find(zipped_name)
            except LookupError:
                pass

    # Nothing matched: build a friendly error message and raise.
    banner = '*'*70
    details = textwrap.fill(
        'Resource %r not found. Please use the NLTK Downloader to '
        'obtain the resource: >>> nltk.download().' %
        (resource_name,), initial_indent=' ', subsequent_indent=' ',
        width=66)
    details += '\n Searched in:' + ''.join('\n - %r' % d for d in path)
    raise LookupError('\n%s\n%s\n%s' % (banner, details, banner))
a468620 added module for accessing resources in nltk data package
Edward Loper authored
455
bb4f25f - Added retrieve()
Edward Loper authored
def retrieve(resource_url, filename=None, verbose=True):
    """
    Copy the given resource to a local file.  If no filename is
    specified, then use the URL's filename.  If there is already a
    file named C{filename}, then raise a C{ValueError}.

    :type resource_url: str
    :param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    :type filename: str
    :param filename: Where to save the copy.  Defaults to the last
        path component of C{resource_url}.
    :type verbose: bool
    :param verbose: If true, print a message describing the copy.
    :raise ValueError: If the destination file already exists.
    """
    if filename is None:
        if resource_url.startswith('file:'):
            # Derive the name from the URL (the filename argument is
            # None here), after dropping the protocol prefix.
            filename = os.path.split(resource_url[len('file:'):])[-1]
        else:
            filename = re.sub(r'(^\w+:)?.*/', '', resource_url)
    if os.path.exists(filename):
        filename = os.path.abspath(filename)
        raise ValueError("File %r already exists!" % filename)

    if verbose:
        # A single parenthesized argument prints identically whether
        # `print` is a statement or a function.
        print('Retrieving %r, saving to %r' % (resource_url, filename))

    # Open the input & output streams; the try/finally blocks make sure
    # both are closed even if the copy fails part-way through.
    infile = _open(resource_url)
    try:
        outfile = open(filename, 'wb')
        try:
            # Copy infile -> outfile, using 64k blocks.
            while True:
                block = infile.read(1024*64)
                if not block:
                    break
                outfile.write(block)
        finally:
            outfile.close()
    finally:
        infile.close()
492
b7801a8 - minor refactoring of load()
Edward Loper authored
#: The formats understood by NLTK's L{load()} method, as a mapping
#: from each format's name to a human-readable description of it.
FORMATS = dict(
    pickle="A serialized python object, stored using the pickle module.",
    yaml="A serialized python object, stored using the yaml module.",
    cfg="A context free grammar, parsed by nltk.parse_cfg().",
    pcfg="A probabilistic CFG, parsed by nltk.parse_pcfg().",
    fcfg="A feature CFG, parsed by nltk.parse_fcfg().",
    fol=("A list of first order logic expressions, parsed by "
         "nltk.sem.parse_fol() using nltk.sem.logic.LogicParser."),
    logic=("A list of first order logic expressions, parsed by "
           "nltk.sem.parse_logic(). Requires an additional logic_parser "
           "parameter"),
    val="A semantic valuation, parsed by nltk.sem.parse_valuation().",
    raw="The raw (byte string) contents of a file.",
)
510
#: Maps a file extension to the name of the format that L{load()}
#: selects for it when called with C{format="auto"}.  Every supported
#: extension is spelled exactly like its format name.
AUTO_FORMATS = dict(
    (extension, extension)
    for extension in ('pickle', 'yaml', 'cfg', 'pcfg',
                      'fcfg', 'fol', 'logic', 'val'))
523
d5162b3 @dhgarrette Modified so that user can specify a parser for the 'sem' feature of a…
dhgarrette authored
524 def load(resource_url, format='auto', cache=True, verbose=False,
525 logic_parser=None, fstruct_parser=None):
bb4f25f - Added retrieve()
Edward Loper authored
526 """
527 Load a given resource from the NLTK data package. The following
a468620 added module for accessing resources in nltk data package
Edward Loper authored
528 resource formats are currently supported:
529 - C{'pickle'}
530 - C{'yaml'}
9fd72c2 Added support for loading files with '.fol' suffix -- formulas of
Ewan Klein authored
531 - C{'cfg'} (context free grammars)
532 - C{'pcfg'} (probabilistic CFGs)
533 - C{'fcfg'} (feature-based CFGs)
534 - C{'fol'} (formulas of First Order Logic)
b0041ed @dhgarrette data.py, data.doctest:
dhgarrette authored
535 - C{'logic'} (Logical formulas to be parsed by the given logic_parser)
9fd72c2 Added support for loading files with '.fol' suffix -- formulas of
Ewan Klein authored
536 - C{'val'} (valuation of First Order Logic model)
bb4f25f - Added retrieve()
Edward Loper authored
537 - C{'raw'}
a468620 added module for accessing resources in nltk data package
Edward Loper authored
538
539 If no format is specified, C{load()} will attempt to determine a
540 format based on the resource name's file extension. If that
541 fails, C{load()} will raise a C{ValueError} exception.
542
2608224 @stevenbird overhaul of import statements; some work on updating docstring format
stevenbird authored
543 :type resource_url: str
544 :param resource_url: A URL specifying where the resource should be
bb4f25f - Added retrieve()
Edward Loper authored
545 loaded from. The default protocol is C{"nltk:"}, which searches
546 for the file in the the NLTK data package.
2608224 @stevenbird overhaul of import statements; some work on updating docstring format
stevenbird authored
547 :type cache: bool
548 :param cache: If true, add this resource to a cache. If C{load}
a468620 added module for accessing resources in nltk data package
Edward Loper authored
549 finds a resource in its cache, then it will return it from the
550 cache rather than loading it. The cache uses weak references,
551 so a resource wil automatically be expunged from the cache
552 when no more objects are using it.
553
2608224 @stevenbird overhaul of import statements; some work on updating docstring format
stevenbird authored
554 :type verbose: bool
555 :param verbose: If true, print a message when loading a resource.
a468620 added module for accessing resources in nltk data package
Edward Loper authored
556 Messages are not displayed when a resource is retrieved from
557 the cache.
d5162b3 @dhgarrette Modified so that user can specify a parser for the 'sem' feature of a…
dhgarrette authored
558
2608224 @stevenbird overhaul of import statements; some work on updating docstring format
stevenbird authored
559 :type logic_parser: C{LogicParser}
560 :param logic_parser: The parser that will be used to parse logical
b0041ed @dhgarrette data.py, data.doctest:
dhgarrette authored
561 expressions.
2608224 @stevenbird overhaul of import statements; some work on updating docstring format
stevenbird authored
562 :type fstruct_parser: C{FeatStructParser}
563 :param fstruct_parser: The parser that will be used to parse the
d5162b3 @dhgarrette Modified so that user can specify a parser for the 'sem' feature of a…
dhgarrette authored
564 feature structure of an fcfg.
a468620 added module for accessing resources in nltk data package
Edward Loper authored
565 """
566 # If we've cached the resource, then just return it.
14b3ccb if cache=False, then don't return values from the cache. (before, ca…
Edward Loper authored
567 if cache:
bb4f25f - Added retrieve()
Edward Loper authored
568 resource_val = _resource_cache.get(resource_url)
14b3ccb if cache=False, then don't return values from the cache. (before, ca…
Edward Loper authored
569 if resource_val is not None:
a027e82 in nltk.data.load, if verbose is true, print a message when using a c…
Edward Loper authored
570 if verbose:
571 print '<<Using cached copy of %s>>' % (resource_url,)
14b3ccb if cache=False, then don't return values from the cache. (before, ca…
Edward Loper authored
572 return resource_val
a468620 added module for accessing resources in nltk data package
Edward Loper authored
573
574 # Let the user know what's going on.
575 if verbose:
bb4f25f - Added retrieve()
Edward Loper authored
576 print '<<Loading %s>>' % (resource_url,)
a468620 added module for accessing resources in nltk data package
Edward Loper authored
577
68c9ca4 added format argument to nltk.data.load()
Edward Loper authored
578 # Determine the format of the resource.
579 if format == 'auto':
3515b1c @jfrazee Added GzipFileSystemPathPointer and related modifications for loading
jfrazee authored
580 resource_url_parts = resource_url.split('.')
581 ext = resource_url_parts[-1]
582 if ext == 'gz':
583 ext = resource_url_parts[-2]
b7801a8 - minor refactoring of load()
Edward Loper authored
584 format = AUTO_FORMATS.get(ext)
585 if format is None:
586 raise ValueError('Could not determine format for %s based '
587 'on its file\nextension; use the "format" '
588 'argument to specify the format explicitly.'
589 % resource_url)
c6f9d85 @stevenbird Applied patch from Joseph Frazee to improve performance for accessing…
stevenbird authored
590
a468620 added module for accessing resources in nltk data package
Edward Loper authored
591 # Load the resource.
68c9ca4 added format argument to nltk.data.load()
Edward Loper authored
592 if format == 'pickle':
bb4f25f - Added retrieve()
Edward Loper authored
593 resource_val = pickle.load(_open(resource_url))
68c9ca4 added format argument to nltk.data.load()
Edward Loper authored
594 elif format == 'yaml':
e2adf2c @stevenbird Cleaned Makefile -- removed reference to old setup-distutils and setu…
stevenbird authored
595 import yaml
bb4f25f - Added retrieve()
Edward Loper authored
596 resource_val = yaml.load(_open(resource_url))
17773cc added *cfg formats
Ewan Klein authored
597 elif format == 'cfg':
a015b5d Modified nltk-internal imports to import values from the modules wher…
Edward Loper authored
598 resource_val = nltk.grammar.parse_cfg(_open(resource_url).read())
17773cc added *cfg formats
Ewan Klein authored
599 elif format == 'pcfg':
a015b5d Modified nltk-internal imports to import values from the modules wher…
Edward Loper authored
600 resource_val = nltk.grammar.parse_pcfg(_open(resource_url).read())
17773cc added *cfg formats
Ewan Klein authored
601 elif format == 'fcfg':
a015b5d Modified nltk-internal imports to import values from the modules wher…
Edward Loper authored
602 resource_val = nltk.grammar.parse_fcfg(_open(resource_url).read(),
d5162b3 @dhgarrette Modified so that user can specify a parser for the 'sem' feature of a…
dhgarrette authored
603 logic_parser=logic_parser,
604 fstruct_parser=fstruct_parser)
9fd72c2 Added support for loading files with '.fol' suffix -- formulas of
Ewan Klein authored
605 elif format == 'fol':
a015b5d Modified nltk-internal imports to import values from the modules wher…
Edward Loper authored
606 resource_val = nltk.sem.parse_logic(_open(resource_url).read(),
607 logic_parser=nltk.sem.logic.LogicParser())
d5162b3 @dhgarrette Modified so that user can specify a parser for the 'sem' feature of a…
dhgarrette authored
608 elif format == 'logic':
a015b5d Modified nltk-internal imports to import values from the modules wher…
Edward Loper authored
609 resource_val = nltk.sem.parse_logic(_open(resource_url).read(),
d5162b3 @dhgarrette Modified so that user can specify a parser for the 'sem' feature of a…
dhgarrette authored
610 logic_parser=logic_parser)
7ad2d47 Support for reading in FOL valuation from a file
Ewan Klein authored
611 elif format == 'val':
a015b5d Modified nltk-internal imports to import values from the modules wher…
Edward Loper authored
612 resource_val = nltk.sem.parse_valuation(_open(resource_url).read())
bb4f25f - Added retrieve()
Edward Loper authored
613 elif format == 'raw':
614 resource_val = _open(resource_url).read()
a468620 added module for accessing resources in nltk data package
Edward Loper authored
615 else:
b7801a8 - minor refactoring of load()
Edward Loper authored
616 assert format not in FORMATS
68c9ca4 added format argument to nltk.data.load()
Edward Loper authored
617 raise ValueError('Unknown format type!')
a468620 added module for accessing resources in nltk data package
Edward Loper authored
618
619 # If requested, add it to the cache.
620 if cache:
bb4f25f - Added retrieve()
Edward Loper authored
621 try:
622 _resource_cache[resource_url] = resource_val
623 except TypeError:
624 # We can't create weak references to some object types, like
625 # strings and tuples. For now, just don't cache them.
626 pass
a468620 added module for accessing resources in nltk data package
Edward Loper authored
627
628 return resource_val
629
1a854d9 Added utility function to display CFGs in text
Ewan Klein authored
def show_cfg(resource_url, escape='##'):
    """
    Print the contents of a grammar file, leaving out empty lines and
    lines that begin with the escape prefix.

    :type resource_url: str
    :param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    :type escape: str
    :param escape: Prepended string that signals lines to be ignored
    """
    # Fetch the raw text of the grammar, bypassing the resource cache.
    grammar_text = load(resource_url, format='raw', cache=False)
    for line in grammar_text.splitlines():
        # Skip escaped lines and lines with no content at all.
        if line.startswith(escape) or line == '':
            continue
        print(line)
646
647
a468620 added module for accessing resources in nltk data package
Edward Loper authored
def clear_cache():
    """
    Empty the resource cache, so that no previously loaded resource
    remains registered in it.

    :see: L{load()}
    """
    _resource_cache.clear()
654
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
655 def _open(resource_url):
bb4f25f - Added retrieve()
Edward Loper authored
656 """
657 Helper function that returns an open file object for a resource,
658 given its resource URL. If the given resource URL uses the 'ntlk'
659 protocol, or uses no protocol, then use L{nltk.data.find} to find
660 its path, and open it with the given mode; if the resource URL
661 uses the 'file' protocol, then open the file with the given mode;
4d0c9b9 - Changed to use urllib2 rather than urllib. urllib2 responds in a
Edward Loper authored
662 otherwise, delegate to C{urllib2.urlopen}.
bb4f25f - Added retrieve()
Edward Loper authored
663
2608224 @stevenbird overhaul of import statements; some work on updating docstring format
stevenbird authored
664 :type resource_url: str
665 :param resource_url: A URL specifying where the resource should be
bb4f25f - Added retrieve()
Edward Loper authored
666 loaded from. The default protocol is C{"nltk:"}, which searches
667 for the file in the the NLTK data package.
668 """
669 # Divide the resource name into "<protocol>:<path>".
670 protocol, path = re.match('(?:(\w+):)?(.*)', resource_url).groups()
671
672 if protocol is None or protocol.lower() == 'nltk':
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
673 return find(path).open()
bb4f25f - Added retrieve()
Edward Loper authored
674 elif protocol.lower() == 'file':
675 # urllib might not use mode='rb', so handle this one ourselves:
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
676 return open(path, 'rb')
bb4f25f - Added retrieve()
Edward Loper authored
677 else:
4d0c9b9 - Changed to use urllib2 rather than urllib. urllib2 responds in a
Edward Loper authored
678 return urllib2.urlopen(resource_url)
bb4f25f - Added retrieve()
Edward Loper authored
679
c869d0f added LazyLoader; moved LazyCorpusReader from nltk.corpus.util to nlt…
Edward Loper authored
680 ######################################################################
681 # Lazy Resource Loader
682 ######################################################################
683
class LazyLoader(object):
    """
    A stand-in for a resource that is not loaded until it is first
    used.  The first attribute lookup that is not satisfied by the
    placeholder itself, or the first call to C{repr()}, loads the
    resource with L{load()} and turns this object into it.
    """
    def __init__(self, path):
        """
        :param path: The resource URL that will be handed to L{load()}
            when the resource is first needed.
        """
        self.__path = path

    def __load(self):
        """
        Load the resource and morph this object into it, by taking over
        the loaded object's C{__dict__} and C{__class__}.
        """
        loaded = load(self.__path)
        # From here on this instance is indistinguishable from `loaded`:
        # it shares its attribute dictionary and has its class.
        self.__dict__ = loaded.__dict__
        self.__class__ = loaded.__class__

    def __getattr__(self, attr):
        """Load the resource, then look C{attr} up on the result."""
        self.__load()
        # Not actually recursive: __load() has replaced our __class__,
        # so this lookup is served by the loaded resource's class.
        return getattr(self, attr)

    def __repr__(self):
        """Load the resource, then return the loaded object's repr."""
        self.__load()
        # Not actually recursive: __load() has replaced our __class__,
        # so this dispatches to the loaded resource's __repr__.
        return repr(self)
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
707
708 ######################################################################
709 # Open-On-Demand ZipFile
710 ######################################################################
711
class OpenOnDemandZipFile(zipfile.ZipFile):
    """
    A subclass of C{zipfile.ZipFile} that closes its file pointer
    whenever it is not using it; and re-opens it when it needs to read
    data from the zipfile.  This is useful for reducing the number of
    open file handles when many zip files are being accessed at once.
    C{OpenOnDemandZipFile} must be constructed from a filename, not a
    file-like object (to allow re-opening).  C{OpenOnDemandZipFile} is
    read-only (i.e., C{write} and C{writestr} are disabled).
    """
    def __init__(self, filename):
        """
        :param filename: The path of the zip file; must be a string.
        :raise TypeError: If C{filename} is not a string.
        """
        if not isinstance(filename, basestring):
            raise TypeError('OpenOnDemandZipFile filename must be a string')
        zipfile.ZipFile.__init__(self, filename)
        assert self.filename == filename
        # The base class constructor has read what it needs; release
        # the file handle until the next read().
        self.close()

    def read(self, name):
        """
        Re-open the zip file, read the member C{name} from it, and
        close the zip file again.

        :param name: The member to read, as accepted by
            C{zipfile.ZipFile.read}.
        :return: The value returned by C{zipfile.ZipFile.read}.
        """
        assert self.fp is None
        self.fp = open(self.filename, 'rb')
        try:
            return zipfile.ZipFile.read(self, name)
        finally:
            # Always release the handle, even if the read fails, so
            # that a bad member name does not leak an open file.
            self.close()

    def write(self, *args, **kwargs):
        """:raise NotImplementedError: OpenOnDemandZipfile is read-only"""
        raise NotImplementedError('OpenOnDemandZipfile is read-only')

    def writestr(self, *args, **kwargs):
        """:raise NotImplementedError: OpenOnDemandZipfile is read-only"""
        raise NotImplementedError('OpenOnDemandZipfile is read-only')

    def __repr__(self):
        return 'OpenOnDemandZipFile(%r)' % self.filename
746
747 ######################################################################
748 #{ Seekable Unicode Stream Reader
749 ######################################################################
750
751 class SeekableUnicodeStreamReader(object):
752 """
753 A stream reader that automatically encodes the source byte stream
754 into unicode (like C{codecs.StreamReader}); but still supports the
755 C{seek()} and C{tell()} operations correctly. This is in contrast
756 to C{codecs.StreamReader}, which provide *broken* C{seek()} and
757 C{tell()} methods.
758
759 This class was motivated by L{StreamBackedCorpusView}, which
760 makes extensive use of C{seek()} and C{tell()}, and needs to be
761 able to handle unicode-encoded files.
762
763 Note: this class requires stateless decoders. To my knowledge,
764 this shouldn't cause a problem with any of python's builtin
765 unicode encodings.
766 """
767 DEBUG = True #: If true, then perform extra sanity checks.
768
769 def __init__(self, stream, encoding, errors='strict'):
770 # Rewind the stream to its beginning.
771 stream.seek(0)
772
773 self.stream = stream
774 """The underlying stream."""
775
776 self.encoding = encoding
777 """The name of the encoding that should be used to encode the
778 underlying stream."""
779
780 self.errors = errors
781 """The error mode that should be used when decoding data from
782 the underlying stream. Can be 'strict', 'ignore', or
783 'replace'."""
784
785 self.decode = codecs.getdecoder(encoding)
786 """The function that is used to decode byte strings into
787 unicode strings."""
788
789 self.bytebuffer = ''
790 """A buffer to use bytes that have been read but have not yet
791 been decoded. This is only used when the final bytes from
792 a read do not form a complete encoding for a character."""
793
794 self.linebuffer = None
795 """A buffer used by L{readline()} to hold characters that have
796 been read, but have not yet been returned by L{read()} or
797 L{readline()}. This buffer consists of a list of unicode
798 strings, where each string corresponds to a single line.
799 The final element of the list may or may not be a complete
800 line. Note that the existence of a linebuffer makes the
801 L{tell()} operation more complex, because it must backtrack
802 to the beginning of the buffer to determine the correct
803 file position in the underlying byte stream."""
804
805 self._rewind_checkpoint = 0
806 """The file position at which the most recent read on the
807 underlying stream began. This is used, together with
808 L{_rewind_numchars}, to backtrack to the beginning of
809 L{linebuffer} (which is required by L{tell()})."""
810
811 self._rewind_numchars = None
812 """The number of characters that have been returned since the
813 read that started at L{_rewind_checkpoint}. This is used,
814 together with L{_rewind_checkpoint}, to backtrack to the
815 beginning of L{linebuffer} (which is required by
816 L{tell()})."""
817
818 self._bom = self._check_bom()
819 """The length of the byte order marker at the beginning of
2608224 @stevenbird overhaul of import statements; some work on updating docstring format
stevenbird authored
820 the stream (or None for no byte order marker)."""
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
821
822 #/////////////////////////////////////////////////////////////////
823 # Read methods
824 #/////////////////////////////////////////////////////////////////
825
826 def read(self, size=None):
827 """
828 Read up to C{size} bytes, decode them using this reader's
829 encoding, and return the resulting unicode string.
830
2608224 @stevenbird overhaul of import statements; some work on updating docstring format
stevenbird authored
831 :param size: The maximum number of bytes to read. If not
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
832 specified, then read as many bytes as possible.
833
2608224 @stevenbird overhaul of import statements; some work on updating docstring format
stevenbird authored
834 :rtype: C{unicode}
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
835 """
836 chars = self._read(size)
837
838 # If linebuffer is not empty, then include it in the result
839 if self.linebuffer:
840 chars = ''.join(self.linebuffer) + chars
841 self.linebuffer = None
842 self._rewind_numchars = None
843
844 return chars
845
846 def readline(self, size=None):
847 """
848 Read a line of text, decode it using this reader's encoding,
849 and return the resulting unicode string.
850
2608224 @stevenbird overhaul of import statements; some work on updating docstring format
stevenbird authored
851 :param size: The maximum number of bytes to read. If no
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
852 newline is encountered before C{size} bytes have been
853 read, then the returned value may not be a complete line
854 of text.
855 """
856 # If we have a non-empty linebuffer, then return the first
857 # line from it. (Note that the last element of linebuffer may
858 # not be a complete line; so let _read() deal with it.)
859 if self.linebuffer and len(self.linebuffer) > 1:
860 line = self.linebuffer.pop(0)
861 self._rewind_numchars += len(line)
862 return line
863
864 readsize = size or 72
865 chars = ''
866
867 # If there's a remaining incomplete line in the buffer, add it.
868 if self.linebuffer:
869 chars += self.linebuffer.pop()
870 self.linebuffer = None
871
872 while True:
873 startpos = self.stream.tell() - len(self.bytebuffer)
874 new_chars = self._read(readsize)
875
876 # If we're at a '\r', then read one extra character, since
877 # it might be a '\n', to get the proper line ending.
878 if new_chars and new_chars.endswith('\r'):
879 new_chars += self._read(1)
880
881 chars += new_chars
882 lines = chars.splitlines(True)
883 if len(lines) > 1:
884 line = lines[0]
885 self.linebuffer = lines[1:]
886 self._rewind_numchars = len(new_chars)-(len(chars)-len(line))
887 self._rewind_checkpoint = startpos
888 break
889 elif len(lines) == 1:
890 line0withend = lines[0]
891 line0withoutend = lines[0].splitlines(False)[0]
892 if line0withend != line0withoutend: # complete line
893 line = line0withend
894 break
895
896 if not new_chars or size is not None:
897 line = chars
898 break
899
900 # Read successively larger blocks of text.
901 if readsize < 8000:
902 readsize *= 2
903
904 return line
905
906 def readlines(self, sizehint=None, keepends=True):
907 """
908 Read this file's contents, decode them using this reader's
909 encoding, and return it as a list of unicode lines.
910
2608224 @stevenbird overhaul of import statements; some work on updating docstring format
stevenbird authored
911 :rtype: list of C{unicode}
912 :param sizehint: Ignored.
913 :param keepends: If false, then strip newlines.
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
914 """
915 return self.read().splitlines(keepends)
916
917 def next(self):
918 """Return the next decoded line from the underlying stream."""
919 line = self.readline()
920 if line: return line
921 else: raise StopIteration
922
923 def __iter__(self):
924 """Return self"""
925 return self
926
927 def xreadlines(self):
928 """Return self"""
929 return self
930
931 #/////////////////////////////////////////////////////////////////
932 # Pass-through methods & properties
933 #/////////////////////////////////////////////////////////////////
934
935 closed = property(lambda self: self.stream.closed, doc="""
936 True if the underlying stream is closed.""")
937
938 name = property(lambda self: self.stream.name, doc="""
939 The name of the underlying stream.""")
940
941 mode = property(lambda self: self.stream.mode, doc="""
942 The mode of the underlying stream.""")
943
944 def close(self):
945 """
946 Close the underlying stream.
947 """
948 self.stream.close()
949
950 #/////////////////////////////////////////////////////////////////
951 # Seek and tell
952 #/////////////////////////////////////////////////////////////////
953
954 def seek(self, offset, whence=0):
955 """
956 Move the stream to a new file position. If the reader is
957 maintaining any buffers, tehn they will be cleared.
958
2608224 @stevenbird overhaul of import statements; some work on updating docstring format
stevenbird authored
959 :param offset: A byte count offset.
960 :param whence: If C{whence} is 0, then the offset is from the
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
961 start of the file (offset should be positive). If
962 C{whence} is 1, then the offset is from the current
963 position (offset may be positive or negative); and if 2,
964 then the offset is from the end of the file (offset should
965 typically be negative).
966 """
967 if whence == 1:
968 raise ValueError('Relative seek is not supported for '
969 'SeekableUnicodeStreamReader -- consider '
970 'using char_seek_forward() instead.')
971 self.stream.seek(offset, whence)
972 self.linebuffer = None
973 self.bytebuffer = ''
974 self._rewind_numchars = None
975 self._rewind_checkpoint = self.stream.tell()
976
977 def char_seek_forward(self, offset):
978 """
979 Move the read pointer forward by C{offset} characters.
980 """
981 if offset < 0:
982 raise ValueError('Negative offsets are not supported')
983 # Clear all buffers.
984 self.seek(self.tell())
985 # Perform the seek operation.
986 self._char_seek_forward(offset)
987
988 def _char_seek_forward(self, offset, est_bytes=None):
989 """
990 Move the file position forward by C{offset} characters,
991 ignoring all buffers.
992
2608224 @stevenbird overhaul of import statements; some work on updating docstring format
stevenbird authored
993 :param est_bytes: A hint, giving an estimate of the number of
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
994 bytes that will be neded to move foward by C{offset} chars.
995 Defaults to C{offset}.
996 """
997 if est_bytes is None: est_bytes = offset
998 bytes = ''
999
1000 while True:
1001 # Read in a block of bytes.
1002 newbytes = self.stream.read(est_bytes-len(bytes))
1003 bytes += newbytes
1004
1005 # Decode the bytes to characters.
1006 chars, bytes_decoded = self._incr_decode(bytes)
1007
1008 # If we got the right number of characters, then seek
1009 # backwards over any truncated characters, and return.
1010 if len(chars) == offset:
1011 self.stream.seek(-len(bytes)+bytes_decoded, 1)
1012 return
1013
1014 # If we went too far, then we can back-up until we get it
1015 # right, using the bytes we've already read.
1016 if len(chars) > offset:
1017 while len(chars) > offset:
1018 # Assume at least one byte/char.
1019 est_bytes += offset-len(chars)
1020 chars, bytes_decoded = self._incr_decode(bytes[:est_bytes])
1021 self.stream.seek(-len(bytes)+bytes_decoded, 1)
1022 return
1023
1024 # Otherwise, we haven't read enough bytes yet; loop again.
1025 est_bytes += offset - len(chars)
1026
1027 def tell(self):
1028 """
1029 Return the current file position on the underlying byte
1030 stream. If this reader is maintaining any buffers, then the
1031 returned file position will be the position of the beginning
1032 of those buffers.
1033 """
1034 # If nothing's buffered, then just return our current filepos:
1035 if self.linebuffer is None:
1036 return self.stream.tell() - len(self.bytebuffer)
1037
1038 # Otherwise, we'll need to backtrack the filepos until we
1039 # reach the beginning of the buffer.
1040
1041 # Store our original file position, so we can return here.
1042 orig_filepos = self.stream.tell()
1043
1044 # Calculate an estimate of where we think the newline is.
1045 bytes_read = ( (orig_filepos-len(self.bytebuffer)) -
1046 self._rewind_checkpoint )
1047 buf_size = sum([len(line) for line in self.linebuffer])
1048 est_bytes = (bytes_read * self._rewind_numchars /
1049 (self._rewind_numchars + buf_size))
1050
1051 self.stream.seek(self._rewind_checkpoint)
1052 self._char_seek_forward(self._rewind_numchars, est_bytes)
1053 filepos = self.stream.tell()
1054
1055 # Sanity check
1056 if self.DEBUG:
1057 self.stream.seek(filepos)
1058 check1 = self._incr_decode(self.stream.read(50))[0]
1059 check2 = ''.join(self.linebuffer)
1060 assert check1.startswith(check2) or check2.startswith(check1)
1061
1062 # Return to our original filepos (so we don't have to throw
1063 # out our buffer.)
1064 self.stream.seek(orig_filepos)
1065
1066 # Return the calculated filepos
1067 return filepos
1068
1069 #/////////////////////////////////////////////////////////////////
1070 # Helper methods
1071 #/////////////////////////////////////////////////////////////////
1072
1073 def _read(self, size=None):
1074 """
1075 Read up to C{size} bytes from the underlying stream, decode
1076 them using this reader's encoding, and return the resulting
1077 unicode string. C{linebuffer} is *not* included in the
1078 result.
1079 """
1080 if size == 0: return u''
1081
1082 # Skip past the byte order marker, if present.
1083 if self._bom and self.stream.tell() == 0:
1084 self.stream.read(self._bom)
1085
1086 # Read the requested number of bytes.
1087 if size is None:
1088 new_bytes = self.stream.read()
1089 else:
1090 new_bytes = self.stream.read(size)
1091 bytes = self.bytebuffer + new_bytes
1092
1093 # Decode the bytes into unicode characters
1094 chars, bytes_decoded = self._incr_decode(bytes)
1095
1096 # If we got bytes but couldn't decode any, then read further.
1097 if (size is not None) and (not chars) and (len(new_bytes) > 0):
1098 while not chars:
1099 new_bytes = self.stream.read(1)
1100 if not new_bytes: break # end of file.
1101 bytes += new_bytes
1102 chars, bytes_decoded = self._incr_decode(bytes)
1103
1104 # Record any bytes we didn't consume.
1105 self.bytebuffer = bytes[bytes_decoded:]
1106
1107 # Return the result
1108 return chars
1109
1110 def _incr_decode(self, bytes):
1111 """
1112 Decode the given byte string into a unicode string, using this
1113 reader's encoding. If an exception is encountered that
1114 appears to be caused by a truncation error, then just decode
1115 the byte string without the bytes that cause the trunctaion
1116 error.
1117
2608224 @stevenbird overhaul of import statements; some work on updating docstring format
stevenbird authored
1118 :return: A tuple C{(chars, num_consumed)}, where C{chars} is
a7c0e45 Merged zipdata branch changes r6304:6324 into the trunk. This branch…
Edward Loper authored
1119 the decoded unicode string, and C{num_consumed} is the
1120 number of bytes that were consumed.
1121 """
1122 while True:
1123 try:
1124 return self.decode(bytes, 'strict')
1125 except UnicodeDecodeError, exc:
1126 # If the exception occurs at the end of the string,
1127 # then assume that it's a truncation error.
1128 if exc.end == len(bytes):
1129 return self.decode(bytes[:exc.start], self.errors)
1130
1131 # Otherwise, if we're being strict, then raise it.
1132 elif self.errors == 'strict':
1133 raise
1134
1135 # If we're not strcit, then re-process it with our
1136 # errors setting. This *may* raise an exception.
1137 else:
1138 return self.decode(bytes, self.errors)
1139
1140 _BOM_TABLE = {
1141 'utf8': [(codecs.BOM_UTF8, None)],
1142 'utf16': [(codecs.BOM_UTF16_LE, 'utf16-le'),
1143 (codecs.BOM_UTF16_BE, 'utf16-be')],
1144 'utf16le': [(codecs.BOM_UTF16_LE, None)],
1145 'utf16be': [(codecs.BOM_UTF16_BE, None)],
1146 'utf32': [(codecs.BOM_UTF32_LE, 'utf32-le'),
1147 (codecs.BOM_UTF32_BE, 'utf32-be')],
1148 'utf32le': [(codecs.BOM_UTF32_LE, None)],
1149 'utf32be': [(codecs.BOM_UTF32_BE, None)],
1150 }
1151
1152 def _check_bom(self):
1153 # Normalize our encoding name
1154 enc = re.sub('[ -]', '', self.encoding.lower())
1155
1156 # Look up our encoding in the BOM table.
1157 bom_info = self._BOM_TABLE.get(enc)
1158
1159 if bom_info:
1160 # Read a prefix, to check against the BOM(s)
1161 bytes = self.stream.read(16)
1162 self.stream.seek(0)
1163
1164 # Check for each possible BOM.
1165 for (bom, new_encoding) in bom_info:
1166 if bytes.startswith(bom):
1167 if new_encoding: self.encoding = new_encoding
1168 return len(bom)
1169
1170 return None
1171
facccdd @stevenbird Changes to improve speed of "import nltk".
stevenbird authored
# The names exported by "from nltk.data import *".
__all__ = ['path', 'PathPointer', 'FileSystemPathPointer', 'BufferedGzipFile',
           'GzipFileSystemPathPointer',
           'find', 'retrieve', 'FORMATS', 'AUTO_FORMATS', 'load',
           'show_cfg', 'clear_cache', 'LazyLoader', 'OpenOnDemandZipFile',
           'SeekableUnicodeStreamReader']
Something went wrong with that request. Please try again.