/
reader.py
93 lines (77 loc) · 2.71 KB
/
reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
from depccg.combinator import UnaryRule
from depccg.lang import ja_default_binary_rules
from depccg.cat import Category
from depccg.tree import Tree
from depccg.tokens import Token
# Lookup table from the combinator symbol written in the bank to the rule
# object used when rebuilding the tree.  The binary symbols are paired
# positionally with ``ja_default_binary_rules``.
combinators = dict(zip(
    ('SSEQ', '>', '<', '>B', '<B1', '<B2', '<B3', '<B4', '>Bx1', '>Bx2', '>Bx3'),
    ja_default_binary_rules,
))

# Each unary symbol gets its own ``UnaryRule`` instance.
for sign in ('ADNext', 'ADNint', 'ADV0', 'ADV1', 'ADV2'):
    combinators[sign] = UnaryRule()

# Non-greedy match of a ``{...}`` annotation; the reader strips these from
# category strings before handing them to ``Category.parse``.
DEPENDENCY = re.compile(r'{.+?}')
def read_ccgbank(filepath):
    """Lazily read a bank file holding one bracketed derivation per line.

    Args:
        filepath: Path of the text file to read.  Lines that are empty
            after stripping whitespace are skipped.

    Yields:
        ``(name, tokens, tree)`` for every non-blank line, where ``name`` is
        the zero-based line index as a string (blank lines still advance
        the index), and ``tokens`` / ``tree`` are the results of parsing
        the line with ``_JaCCGLineReader``.
    """
    # The context manager closes the file once the generator is exhausted
    # or closed; a bare ``open()`` in the loop header would leave that to
    # the garbage collector.
    with open(filepath) as f:
        for i, line in enumerate(f):
            line = line.strip()
            if not line:
                continue
            tree, tokens = _JaCCGLineReader(line).parse()
            yield str(i), tokens, tree
class _JaCCGLineReader(object):
    """Recursive-descent reader for one line of a bracketed CCG derivation.

    As implemented below, an internal node is written
    ``{OP CAT {child} ...}`` where ``OP`` is a key of ``combinators``, and
    a leaf is written ``{CAT surf/base/pos1/pos2}``.  The reader keeps a
    cursor (``index``) into ``line`` and collects a ``Token`` for every
    leaf it reads, in left-to-right order.
    """

    def __init__(self, line):
        """Prepare to parse ``line`` from its first character."""
        self.lang = 'ja'
        self.line = line
        self.index = 0      # cursor: position of the next unread character
        self.word_id = -1   # index of the last leaf read (-1 before any)
        self.tokens = []    # Token objects, one per leaf, in reading order

    def next(self, target):
        """Consume text up to the next ``target`` and return it.

        The returned string excludes ``target``; the cursor is left just
        past the position where ``target`` was found.

        Raises:
            RuntimeError: if ``target`` does not occur at or after the
                cursor.  The cursor is left untouched in that case.
        """
        end = self.line.find(target, self.index)
        if end < 0:
            # str.find returns -1 on a miss; slicing with it would drop the
            # last character and reset the cursor to 0, restarting the parse.
            raise RuntimeError(
                f'failed to parse, expected {target!r} after position '
                f'{self.index}: {self.line}')
        res = self.line[self.index:end]
        self.index = end + 1
        return res

    def check(self, text, offset=0):
        """Raise ``RuntimeError`` unless the character at cursor+offset is ``text``."""
        if self.line[self.index + offset] != text:
            raise RuntimeError('AutoLineReader.check catches parse error')

    def peek(self):
        """Return the character under the cursor without consuming it."""
        return self.line[self.index]

    def parse(self):
        """Parse the whole line.

        Returns:
            ``(tree, tokens)``: the root node and the list of leaf tokens.
        """
        res = self.next_node()
        return res, self.tokens

    @property
    def next_node(self):
        """Select the parser method for the node starting at the cursor.

        This is a property that returns a bound method, so
        ``self.next_node()`` first picks ``parse_tree`` or ``parse_leaf``
        and then calls it.  The node is internal when the text between the
        opening brace and the next space is a known combinator symbol.
        """
        end = self.line.find(' ', self.index)
        if self.line[self.index + 1:end] in combinators:
            return self.parse_tree
        return self.parse_leaf

    def parse_leaf(self):
        """Parse a leaf node, record its token and return a terminal tree."""
        self.word_id += 1
        self.check('{')
        cat = self.next(' ')[1:]  # drop the opening '{'
        # Keep only the part before the first '_'.  ``partition`` leaves the
        # category intact when no '_' is present, whereas slicing with
        # ``find`` would cut off its last character.
        cat = cat.partition('_')[0]
        cat = DEPENDENCY.sub('', cat)
        cat = Category.parse(cat)
        # NOTE(review): the ``[:-1]`` drops the last character before '}' --
        # presumably a trailing separator in the bank format; confirm
        # against the data.
        surf, base, pos1, pos2 = self.next('}')[:-1].split('/')
        token = Token(surf=surf, base=base, pos1=pos1, pos2=pos2)
        self.tokens.append(token)
        return Tree.make_terminal(surf, cat, self.lang)

    def parse_tree(self):
        """Parse an internal node and return a unary or binary tree.

        Raises:
            AssertionError: if the node has neither one nor two children.
        """
        self.check('{')
        op = self.next(' ')
        op = combinators[op[1:]]  # drop the opening '{' before the lookup
        cat = DEPENDENCY.sub('', self.next(' '))
        cat = Category.parse(cat)
        self.check('{')
        children = []
        while self.peek() != '}':
            children.append(self.next_node())
            if self.peek() == ' ':
                self.next(' ')  # skip the separator between siblings
        self.next('}')
        if len(children) == 1:
            return Tree.make_unary(cat, children[0], self.lang)
        if len(children) != 2:
            # Raised explicitly so the check survives ``python -O``; the
            # exception type matches the former ``assert``.
            raise AssertionError(
                f'failed to parse, invalid number of children: {self.line}')
        left, right = children
        return Tree.make_binary(cat, left, right, op, self.lang)