/
genesis.py
76 lines (62 loc) · 2.18 KB
/
genesis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Natural Language Toolkit: Genesis Corpus Reader
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Steven Bird <sb@ldc.upenn.edu>
# Edward Loper <edloper@gradient.cis.upenn.edu>
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT
"""
The Genesis Corpus.
This corpus has been prepared from several web sources; formatting,
markup and verse numbers have been stripped.
english-kjv - Genesis, King James version (Project Gutenberg)
english-web - Genesis, World English Bible (Project Gutenberg)
french - Genesis, Louis Segond 1910
german - Genesis, Luther Translation
swedish - Genesis, Gamla och Nya Testamentet, 1917 (Project Runeberg)
finnish - Genesis, Suomen evankelis-luterilaisen kirkon kirkolliskokouksen vuonna 1992 kayttoon ottama suomennos
"""
from util import *
from nltk import tokenize
import os
# Single table of the Genesis texts shipped with the corpus, as
# (identifier, human-readable description) pairs.  The identifier is the
# file name without its ".txt" extension.
_GENESIS_TEXTS = (
    ('english-kjv', 'Genesis, King James version (Project Gutenberg)'),
    ('english-web', 'Genesis, World English Bible (Project Gutenberg)'),
    ('french', 'Genesis, Louis Segond 1910'),
    ('german', 'Genesis, Luther Translation'),
    ('swedish', 'Genesis, Gamla och Nya Testamentet, 1917 (Project Runeberg)'),
    ('finnish', 'Genesis, Suomen evankelis-luterilaisen kirkon kirkolliskokouksen vuonna 1992 kayttoon ottama suomennos'),
)

# Identifiers of the available texts, in table order.  A generator
# expression is used on purpose: unlike a list comprehension it does not
# leak its loop variables into the module namespace under Python 2.
items = list(key for key, _ in _GENESIS_TEXTS)

# Mapping from each identifier to its description.
item_name = dict(_GENESIS_TEXTS)
def raw(files = 'english-kjv'):
    """
    Iterate over the whitespace-separated tokens of one or more Genesis texts.

    Each requested text is read in full from the C{genesis} directory under
    the corpus base directory (C{<name>.txt}) and split with
    L{tokenize.whitespace}.  The tokens of every text are yielded one after
    another, in the order the names were given.

    @param files: One or more text identifiers (e.g. C{'finnish'}),
        given without the C{.txt} extension
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over the tokens produced by L{tokenize.whitespace}
    """
    # A lone name is wrapped in a tuple so the loop below handles both cases.
    # basestring (instead of an exact ``str`` type check) also covers unicode
    # names and str subclasses, which would otherwise be iterated character
    # by character.
    if isinstance(files, basestring): files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "genesis", name + ".txt")
        f = open_corpus(path)
        # Read everything and close the handle before the first yield, so the
        # file is not left open while the generator is suspended or abandoned.
        # NOTE(review): assumes open_corpus returns a file-like object that
        # provides close() -- confirm against util.
        try:
            text = f.read()
        finally:
            f.close()
        for t in tokenize.whitespace(text):
            yield t
def demo():
from nltk.corpora import genesis
from itertools import islice
print 'English:'
for word in islice(genesis.raw(), 27):
print word,
print
print 'Finnish:'
for word in islice(genesis.raw('finnish'), 27):
print word,
print
# Run the demonstration only when this module is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    demo()