-
Notifications
You must be signed in to change notification settings - Fork 41
/
interpretation.py
154 lines (127 loc) · 4.13 KB
/
interpretation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# coding: utf-8
from __future__ import unicode_literals
# TODO: using the pure-Python implementation because of jellyfish issue #55
from jellyfish._jellyfish import damerau_levenshtein_distance, levenshtein_distance
from yargy.compat import str
from yargy.normalization import get_normalized_text
def choice_best_span(a, b):
    '''
    Pick the preferred one of two candidate spans.

    Each argument may be a falsy placeholder (e.g. ``None`` or an empty
    list), a list, or a single object exposing a sized ``value``
    attribute.

    Rules, applied in order:

    1. both candidates falsy -> ``None``;
    2. exactly one candidate truthy -> that candidate;
    3. both are lists -> the longer list, ``a`` on a tie;
    4. exactly one is a list -> the list;
    5. otherwise ``value`` is compared: a candidate whose ``value`` has
       length 1 loses to the other one (``a`` is examined first), then
       the strictly longer ``value`` wins and ``b`` wins a tie.
    '''
    if not a and not b:
        return None
    # From here on at least one candidate is truthy.
    if not a:
        return b
    if not b:
        return a

    a_is_list = isinstance(a, list)
    b_is_list = isinstance(b, list)
    if a_is_list and b_is_list:
        return a if len(a) >= len(b) else b
    if a_is_list:
        return a
    if b_is_list:
        return b

    if len(a.value) == 1:
        return b
    if len(b.value) == 1:
        return a
    # NOTE: a comparison of ``a.forms`` / ``b.forms`` used to follow this
    # statement, but it was unreachable (both branches here return), so
    # it has been dropped without changing the result.
    return a if len(a.value) > len(b.value) else b
class InterpretationObject(object):
    '''
    Base class for object interpretation

    Subclasses set ``Attributes`` to a class exposing ``__members__``
    (such as an enum); every member name becomes a lower-cased instance
    attribute that defaults to ``None``.

    ``abbr`` and ``normalized`` read ``self.spans``, so that attribute
    has to be supplied (e.g. as a constructor keyword argument) before
    those properties or the comparison operators are used.

    No ``__hash__`` is defined next to ``__eq__``; on Python 3 this
    makes instances unhashable.
    '''

    Attributes = None
    # Largest edit distance between normalized texts that still counts
    # as "equal" in ``__eq__``.
    SIMILARITY_THRESHOLD = 2

    def __init__(self, **kwargs):
        '''
        Initialise every declared attribute to ``None`` and then store
        the given keyword arguments as instance attributes.
        '''
        for key in self.Attributes.__members__:
            # set default values for object attributes
            self.__dict__[key.lower()] = None
        self.__dict__.update(kwargs)

    @property
    def abbr(self):
        '''
        Set of lower-cased abbreviations, one per span.

        A span with several items contributes the first characters of
        its items' values joined together; a single-item span
        contributes the whole value.
        '''
        abbr = set()
        for span in self.spans:
            if len(span) > 1:
                abbr.add(''.join(str(x.value)[0].lower() for x in span))
            else:
                abbr.add(str(span[0].value).lower())
        return abbr

    @property
    def normalized(self):
        '''
        Lower-cased result of ``get_normalized_text`` for the first span.
        '''
        return get_normalized_text(
            self.spans[0],
        ).lower()

    def difference(self, another):
        '''
        Damerau-Levenshtein distance between the normalized texts of
        this object and ``another``.
        '''
        return damerau_levenshtein_distance(
            self.normalized,
            another.normalized,
        )

    def __repr__(self):
        return '{cls}({attrs})'.format(
            cls=self.__class__.__name__,
            attrs=self.__dict__,
        )

    def __iter__(self):
        '''
        Yield ``(attribute name, value)`` pairs, so ``dict(obj)`` works.
        '''
        for k, v in self.__dict__.items():
            yield k, v

    def __eq__(self, another):
        '''
        Fuzzy equality between two objects of the same class.

        Two objects are equal when one normalized text contains the
        other, when they share an abbreviation, or when their edit
        distance does not exceed ``SIMILARITY_THRESHOLD``. Objects of
        an unrelated class are never equal.
        '''
        if not isinstance(another, self.__class__):
            return False
        longer, shorter = self.normalized, another.normalized
        # Order by length, not lexicographically: only the shorter text
        # can be contained in the longer one, whatever their sort order.
        if len(shorter) > len(longer):
            longer, shorter = shorter, longer
        if shorter in longer:
            return True
        if self.abbr & another.abbr:
            return True
        return self.difference(another) <= self.SIMILARITY_THRESHOLD

    def __ne__(self, another):
        # Python 2 does not derive ``!=`` from ``__eq__``; keep the two
        # operators consistent on both major versions.
        return not self.__eq__(another)
class InterpretationEngine(object):
    '''
    This class creates objects from text spans

    ``object_class`` has to expose an ``Attributes`` container that
    supports ``in`` tests and must accept the collected fields as
    keyword arguments.
    '''

    def __init__(self, object_class):
        self.object_class = object_class

    def _resolve_attribute(self, attribute):
        '''
        Return the attribute known to ``object_class.Attributes``, or
        ``None`` when there is no such attribute.

        ``attribute`` is either a single attribute or a list of
        candidates; for a list the last matching candidate wins.
        '''
        known = self.object_class.Attributes
        if isinstance(attribute, list):
            candidates = attribute
        else:
            candidates = [attribute]
        resolved = None
        for candidate in candidates:
            if candidate in known:
                resolved = candidate
        return resolved

    def extract(self, matches):
        '''
        Yield one ``object_class`` instance per match that has at least
        one interpreted token.

        ``matches`` is an iterable of pairs whose second item is the
        sequence of matched tokens. Each token with a truthy
        ``interpretation`` is filed under the lower-cased name of its
        attribute; several tokens for the same attribute are collected
        into a list. The full token sequence is passed as ``spans``.
        '''
        for _, tokens in matches:
            fields = {}
            for token in tokens:
                if not token.interpretation:
                    continue
                attribute = self._resolve_attribute(
                    token.interpretation['attribute'],
                )
                # Compare with None explicitly: a matching attribute may
                # itself be falsy (e.g. an IntEnum member equal to 0).
                if attribute is None:
                    continue
                name = attribute.name.lower()
                existing = fields.get(name)
                if existing is None:
                    fields[name] = token
                elif isinstance(existing, list):
                    existing.append(token)
                else:
                    fields[name] = [existing, token]
            if fields:
                fields['spans'] = [
                    tokens
                ]
                yield self.object_class(**fields)