Skip to content

Commit

Permalink
#398 -- related simple support for brat formatting.
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Sep 20, 2022
1 parent a35d04a commit e860c97
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 13 deletions.
29 changes: 29 additions & 0 deletions arekit/common/bound.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,32 @@ def Length(self):
return self.__length

# endregion

def itersects_with(self, other):
    """Tell whether this bound overlaps ``other``.

    This bound is treated as the half-open range ``[pos, pos + length)``;
    ``other`` is described by its first position and its last *included*
    position (``Position + Length - 1``).

    :param other: object exposing ``Position`` and ``Length``.
    :return: True when ``other`` starts inside this bound, ends inside this
        bound, or starts before it and reaches at least up to its end.
    """
    own_start = self.__pos
    own_stop = self.__pos + self.__length
    other_first = other.Position
    other_last = other.Position + other.Length - 1

    return (
        # The other bound starts inside this one.
        own_start <= other_first < own_stop
        # The other bound ends inside this one.
        or own_start <= other_last < own_stop
        # The other bound starts earlier and covers this one.
        or (other_first < own_start and own_stop <= other_last)
    )

def intersect(self, other):
    """Build the bound that spans both this bound and ``other``.

    Despite the name, the result is the covering range: it starts at the
    smaller of the two start positions and stops at the larger of the two
    (exclusive) end positions.

    :param other: object exposing ``Position`` and ``Length``.
    :return: a new ``Bound`` covering both ranges.
    """
    own_start = self.__pos
    own_stop = own_start + self.__length
    other_start = other.Position
    other_stop = other.Position + other.Length

    span_start = min(own_start, other_start)
    span_stop = max(own_stop, other_stop)
    return Bound(pos=span_start, length=span_stop - span_start)

def contains(self, other):
    """Tell whether ``other`` lies entirely within this bound.

    Both ranges are compared as ``[start, start + length)``.

    :param other: object exposing ``Position`` and ``Length``.
    :return: True when ``other`` starts no earlier and stops no later than
        this bound.
    """
    own_start = self.__pos
    own_stop = own_start + self.__length
    inner_start = other.Position
    inner_stop = other.Position + other.Length
    return own_start <= inner_start and inner_stop <= own_stop
7 changes: 7 additions & 0 deletions arekit/contrib/source/brat/entities/compound.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ def __init__(self, id_in_doc, value, e_type, root, entities, index_begin, index_
self.__entities = entities
self.__root = root

@classmethod
def from_list(cls, root, childs):
    """Alternate constructor: build a compound entity out of a list.

    The identifier, value, type and index range of the result are copied
    from ``root``; ``childs`` become the nested entities.

    :param root: ``BratEntity`` whose attributes describe the compound.
    :param childs: non-empty list of nested entities.
    :return: a new instance of ``cls``.
    """
    assert(isinstance(root, BratEntity))
    assert(isinstance(childs, list) and len(childs) > 0)

    # NOTE(review): ``root=None`` is passed to the constructor even though a
    # ``root`` entity is supplied here -- confirm this is intended.
    params = dict(
        id_in_doc=root.ID,
        value=root.Value,
        e_type=root.Type,
        root=None,
        entities=childs,
        index_begin=root.IndexBegin,
        index_end=root.IndexEnd,
    )
    return cls(**params)

@property
def Root(self):
return self.__root
Expand Down
47 changes: 40 additions & 7 deletions arekit/contrib/source/brat/sentence.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
from functools import cmp_to_key

from arekit.common.bound import Bound
from arekit.common.news.sentence import BaseNewsSentence
from arekit.contrib.source.brat.entities.compound import BratCompoundEntity
from arekit.contrib.source.brat.entities.entity import BratEntity


class BratSentence(BaseNewsSentence):
Expand All @@ -21,16 +25,45 @@ def __init__(self, text, index_begin, entities):
self.__index_begin = index_begin
self.__entities = entities

def iter_entity_with_local_bounds(self, avoid_intersection=True):
last_position = -1
@staticmethod
def cmp_entities(a, b):
    """Three-way comparison of two entities for sorting.

    Entities are ordered by ``IndexBegin`` ascending; when both start at the
    same index, the longer one (larger ``IndexEnd - IndexBegin``) comes first.

    :param a: ``BratEntity``.
    :param b: ``BratEntity``.
    :return: negative, zero or positive number, ``cmp``-style.
    """
    assert(isinstance(a, BratEntity))
    assert(isinstance(b, BratEntity))

    # Ordered by appearance.
    begin_delta = a.IndexBegin - b.IndexBegin
    if begin_delta != 0:
        return begin_delta

    # Same start: the longer entity goes first.
    length_of_b = b.IndexEnd - b.IndexBegin
    length_of_a = a.IndexEnd - a.IndexBegin
    return length_of_b - length_of_a

def iter_entity_with_local_bounds(self):
    """Iterate over entities paired with their sentence-local bounds.

    Entities are visited in ``cmp_entities`` order. An entity whose local
    bound intersects the bound of the most recent group is merged into that
    group, and the group bound is widened to cover both. Every other entity
    starts a new group.

    :return: generator of ``(entity, Bound)`` pairs. A group with a single
        member yields that entity as is; a group with several members yields
        a ``BratCompoundEntity`` built from the first member as root and the
        remaining members as children.
    """
    # Sort a copy so that iterating does not reorder the entity list that
    # was handed to the constructor.
    ordered = sorted(self.__entities, key=cmp_to_key(self.cmp_entities))

    bounds_and_entities = []

    # Merging nested entities.
    for entity in ordered:
        # Bounds are expressed relative to the beginning of the sentence.
        start = entity.IndexBegin - self.__index_begin
        end = entity.IndexEnd - self.__index_begin
        bound = Bound(pos=start, length=end - start)

        if bounds_and_entities:
            last_bound, last_entities = bounds_and_entities[-1]
            if bound.itersects_with(last_bound):
                # Extend the most recent group instead of opening a new one.
                last_entities.append(entity)
                bounds_and_entities[-1] = (bound.intersect(last_bound), last_entities)
                continue

        bounds_and_entities.append((bound, [entity]))

    # Returning result.
    for bound, entities in bounds_and_entities:
        if len(entities) == 1:
            yield entities[0], bound
        else:
            yield BratCompoundEntity.from_list(root=entities[0], childs=entities[1:]), bound
13 changes: 9 additions & 4 deletions tests/contrib/source/test_brat_compound_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from arekit.common.bound import Bound
from arekit.common.entities.collection import EntityCollection
from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
from arekit.contrib.source.brat.entities.compound import BratCompoundEntity
from arekit.contrib.source.brat.entities.entity import BratEntity
from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
Expand All @@ -14,9 +15,9 @@ class TestCompoundEntites(unittest.TestCase):
text = "мама мыла раму"
entities = [
BratEntity(id_in_doc="T1", e_type="PERSON", index_begin=0, index_end=4, value="мама"),
BratEntity(id_in_doc="T2", e_type="VERB", index_begin=5, index_end=9, value="мыла"),
BratEntity(id_in_doc="T3", e_type="OBJECT", index_begin=10, index_end=14, value="раму"),
BratEntity(id_in_doc="T3", e_type="ACTION", index_begin=0, index_end=9, value="мама мыла")
BratEntity(id_in_doc="T2", e_type="VERB", index_begin=5, index_end=8, value="мыл"),
BratEntity(id_in_doc="T3", e_type="OBJECT", index_begin=9, index_end=13, value="раму"),
BratEntity(id_in_doc="T3", e_type="ACTION", index_begin=0, index_end=8, value="мама мыл")
]

def test(self):
Expand All @@ -38,7 +39,11 @@ def test(self):
for sentence in sentences:
for e, b in sentence.iter_entity_with_local_bounds():
assert(isinstance(b, Bound))
print(e.Value, b.Position, b.Position + b.Length)
print(type(e))
print("{} ({}, {})".format(e.Value, b.Position, b.Position + b.Length))
if isinstance(e, BratCompoundEntity):
for ee in e.iter_childs():
print("\t{} ({}, {}) [{}]".format(ee.Value, ee.IndexBegin, ee.IndexEnd, ee.Type))


if __name__ == '__main__':
Expand Down
12 changes: 10 additions & 2 deletions tests/tutorials/test_tutorial_collection_binding.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from arekit.common.entities.collection import EntityCollection
from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
from arekit.contrib.source.brat.annot import BratAnnotationParser
from arekit.contrib.source.brat.entities.compound import BratCompoundEntity
from arekit.contrib.source.brat.news import BratNews
from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
from arekit.contrib.source.zip_utils import ZipArchiveUtils
Expand Down Expand Up @@ -96,9 +97,16 @@ def test_reading(self):
for sentence in news.iter_sentences():
print(sentence.Text.strip())
for entity, bound in sentence.iter_entity_with_local_bounds():
print("{}: ['{}',{}, {}]".format(
print("{}: ['{}',{}, {}] {}".format(
entity.ID, entity.Value, entity.Type,
"-".join([str(bound.Position), str(bound.Position + bound.Length)])))
"-".join([str(bound.Position), str(bound.Position + bound.Length)]),
"[COMPOUND]" if isinstance(entity, BratCompoundEntity) else ""))

if not isinstance(entity, BratCompoundEntity):
continue

for child in entity.iter_childs():
print("\t{}: ['{}',{}]".format(child.ID, child.Value, child.Type))

for brat_relation in news.Relations:
print(brat_relation.SourceID, brat_relation.TargetID, brat_relation.Type)
Expand Down

0 comments on commit e860c97

Please sign in to comment.