Skip to content

Commit

Permalink
#495 supported nested entities in 0.24.0
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Aug 19, 2023
1 parent 8c1635d commit 8c1ee11
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 47 deletions.
3 changes: 2 additions & 1 deletion arekit/common/docs/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@

class DocumentEntity(Entity):

def __init__(self, value, display_value, e_type, id_in_doc, group_index):
def __init__(self, value, display_value, e_type, childs, id_in_doc, group_index):
    """ Creates an entity that additionally keeps its document-level id.

        value, display_value, e_type, childs, group_index:
            forwarded as-is to the base `Entity` constructor.
        id_in_doc: Id, utilized within the internal services
    """
    base_init = super(DocumentEntity, self).__init__
    base_init(value=value,
              display_value=display_value,
              e_type=e_type,
              group_index=group_index,
              childs=childs)

    # The id is kept here; it is not passed to the base class.
    self.__id = id_in_doc

Expand Down
38 changes: 29 additions & 9 deletions arekit/common/docs/parsed/providers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,41 @@ def Name(self):
def init_parsed_doc(self, parsed_doc):
assert(isinstance(parsed_doc, ParsedDocument))

def __iter_childs_and_root_node(entity):
    """ Note: an Entity may have childs; we iterate over them so that they
        are considered as well, and still keep the root node itself.

        Yields pairs (node, is_child): every item of `entity.iter_childs()`
        with is_child=True, followed by `entity` with is_child=False.
    """
    # Childs go first ...
    yield from ((nested, True) for nested in entity.iter_childs())

    # ... and the root node is always the last item.
    yield entity, False

self._doc_entities = []
self.__entity_map.clear()

for index, entity in enumerate(parsed_doc.iter_entities()):
current_id = 0
for _, entity in enumerate(parsed_doc.iter_entities()):

child_doc_entities = []
for tree_entity, is_child in __iter_childs_and_root_node(entity):

doc_entity = DocumentEntity(id_in_doc=current_id,
value=tree_entity.Value,
e_type=tree_entity.Type,
display_value=tree_entity.DisplayValue,
childs=None if is_child else child_doc_entities,
group_index=tree_entity.GroupIndex)
current_id += 1

doc_entity = DocumentEntity(id_in_doc=index,
value=entity.Value,
e_type=entity.Type,
display_value=entity.DisplayValue,
group_index=entity.GroupIndex)
if is_child:
child_doc_entities.append(doc_entity)

self._doc_entities.append(doc_entity)
self._doc_entities.append(doc_entity)

if self.__entity_index_func is not None:
self.__entity_map[self.__entity_index_func(entity)] = doc_entity
if self.__entity_index_func is not None:
self.__entity_map[self.__entity_index_func(tree_entity)] = doc_entity

def get_document_entity(self, entity):
""" Maps entity to the related one with DocumentEntity type
Expand Down
28 changes: 16 additions & 12 deletions arekit/common/docs/parsed/providers/entity_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def init_parsed_doc(self, parsed_doc):
super(EntityServiceProvider, self).init_parsed_doc(parsed_doc)
assert(isinstance(parsed_doc, ParsedDocument))
self.__iter_raw_terms_func = lambda: parsed_doc.iter_terms(filter_func=None, term_only=False)
self.__init_entity_positions()
self.__entity_positions = self.__calculate_entity_positions()

# region public 'extract' methods

Expand Down Expand Up @@ -147,25 +147,29 @@ def __get_end_id(text_opinion, end_type):
assert(end_type == EntityEndType.Source or end_type == EntityEndType.Target)
return text_opinion.SourceId if end_type == EntityEndType.Source else text_opinion.TargetId

def __init_entity_positions(self):
self.__entity_positions = self.__calculate_entity_positions()

def __calculate_entity_positions(self):
""" Note: here we consider the same order as in self._entities.
"""
positions = []
t_ind_in_doc = 0
t_ind_in_doc = -1

positions = {}
for s_ind, t_ind_in_sent, term in self.__iter_raw_terms_func():

if isinstance(term, Entity):
position = TermPosition(term_ind_in_doc=t_ind_in_doc,
term_ind_in_sent=t_ind_in_sent,
s_ind=s_ind)
positions.append(position)

t_ind_in_doc += 1

if not isinstance(term, Entity):
continue

# We consider that entities within a single tree have the same positions.
for tree_entity in list(term.iter_childs()) + [term]:

key = self.get_document_entity(tree_entity).IdInDocument
assert(key not in positions)

positions[key] = TermPosition(term_ind_in_doc=t_ind_in_doc,
term_ind_in_sent=t_ind_in_sent,
s_ind=s_ind)

return positions

# endregion
11 changes: 10 additions & 1 deletion arekit/common/entities/base.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
class Entity(object):

def __init__(self, value, e_type, display_value=None, group_index=None):
def __init__(self, value, e_type, childs=None, display_value=None, group_index=None):
assert(isinstance(value, str) and len(value) > 0)
assert(isinstance(e_type, str) or e_type is None)
assert(isinstance(display_value, str) or display_value is None)
assert(isinstance(group_index, int) or group_index is None)
assert(isinstance(childs, list) or childs is None)
self.__value = value
self.__type = e_type
self.__display_value = display_value
self.__group_index = group_index
self.__childs = childs

@property
def GroupIndex(self):
Expand Down Expand Up @@ -40,3 +42,10 @@ def set_group_index(self, value):
assert(isinstance(value, int) and value >= -1)
assert(self.__group_index is None)
self.__group_index = value

def iter_childs(self):
    """ Iterates over the childs passed to the constructor.

        Yields nothing when the entity was created with `childs=None`;
        otherwise yields the items of the `childs` list in order.
    """
    if self.__childs is None:
        # No childs: finish the generator without yielding anything.
        # (The `for` loop below already makes this function a generator,
        # so no extra unreachable `yield` is needed here.)
        return

    for child in self.__childs:
        yield child
1 change: 1 addition & 0 deletions arekit/contrib/source/brat/annot.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def handle_entity(args):
e_type=e_str_type,
index_begin=int(e_begin),
index_end=int(e_end),
childs=None,
value=args[2].strip())

@staticmethod
Expand Down
24 changes: 2 additions & 22 deletions arekit/contrib/source/brat/entities/compound.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,9 @@ class BratCompoundEntity(BratEntity):
""" Entity which contains the hierarchy of the other entities.
"""

def __init__(self, id_in_doc, value, e_type, root, entities, index_begin, index_end,
display_value=None, group_index=None):
assert(isinstance(entities, list))
assert(isinstance(root, BratCompoundEntity) or root is None)
super(BratCompoundEntity, self).__init__(value=value, e_type=e_type,
id_in_doc=id_in_doc,
index_begin=index_begin,
index_end=index_end,
display_value=display_value,
group_index=group_index)
self.__entities = entities
self.__root = root

@classmethod
def from_list(cls, root, childs):
assert(isinstance(root, BratEntity))
assert(isinstance(childs, list) and len(childs) > 0)
return cls(id_in_doc=root.ID, value=root.Value, e_type=root.Type, root=None,
entities=childs, index_begin=root.IndexBegin, index_end=root.IndexEnd)

@property
def Root(self):
return self.__root

def iter_childs(self):
return iter(self.__entities)
return cls(id_in_doc=root.ID, value=root.Value, e_type=root.Type, childs=childs,
index_begin=root.IndexBegin, index_end=root.IndexEnd)
4 changes: 2 additions & 2 deletions arekit/contrib/source/brat/entities/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ class BratEntity(Entity):
Provides bounds, i.e. char indices in related sentence.
"""

def __init__(self, id_in_doc, e_type, index_begin, index_end, value, display_value=None, group_index=None):
def __init__(self, id_in_doc, e_type, index_begin, index_end, value, childs, display_value=None, group_index=None):
""" index_begin: int
- char index (in case of string type of `text`)
- term index (in case of list type of `text`)
Expand All @@ -17,7 +17,7 @@ def __init__(self, id_in_doc, e_type, index_begin, index_end, value, display_val
assert(isinstance(e_type, str))
assert(isinstance(index_begin, int))
assert(isinstance(index_end, int))
super(BratEntity, self).__init__(value=value, e_type=e_type,
super(BratEntity, self).__init__(value=value, e_type=e_type, childs=childs,
display_value=display_value, group_index=group_index)

self.__e_type = e_type
Expand Down

0 comments on commit 8c1ee11

Please sign in to comment.