Skip to content

Commit

Permalink
#398 related
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Sep 16, 2022
1 parent 5fc935a commit a35d04a
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 6 deletions.
24 changes: 24 additions & 0 deletions arekit/contrib/source/brat/entities/compound.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from arekit.contrib.source.brat.entities.entity import BratEntity


class BratCompoundEntity(BratEntity):
    """ Entity that aggregates a hierarchy of nested entities.

        Combines the ordinary BratEntity payload (value, type, document id,
        character span, optional group index) with a list of child entities
        and an optional link to the parent compound entity.
    """

    def __init__(self, id_in_doc, value, e_type, root, entities, index_begin, index_end, group_index=None):
        assert(isinstance(entities, list))
        assert(isinstance(root, BratCompoundEntity) or root is None)
        super(BratCompoundEntity, self).__init__(value=value,
                                                 e_type=e_type,
                                                 id_in_doc=id_in_doc,
                                                 index_begin=index_begin,
                                                 index_end=index_end,
                                                 group_index=group_index)
        self.__root = root
        self.__entities = entities

    @property
    def Root(self):
        # Parent compound entity, or None for a top-level entity.
        return self.__root

    def iter_childs(self):
        # Iterator over the directly nested child entities.
        return iter(self.__entities)
27 changes: 21 additions & 6 deletions arekit/contrib/source/brat/sentences_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ def from_file(input_file, entities, line_handler=None, skip_entity_func=None):
assert(isinstance(entities, EntityCollection))
assert(callable(skip_entity_func) or skip_entity_func is None)

sentences_data = BratDocumentSentencesReader.__parse_sentences(
sentences_data = BratDocumentSentencesReader._parse_sentences(
input_file=input_file, line_handler=line_handler)

sentence_entities = BratDocumentSentencesReader.__parse_entities(
sentence_entities = BratDocumentSentencesReader._parse_entities(
sentences_data=sentences_data,
entities=entities,
skip_entity_func=skip_entity_func)
Expand All @@ -28,9 +28,24 @@ def from_file(input_file, entities, line_handler=None, skip_entity_func=None):

return brat_sentences

# endregion
    @staticmethod
    def from_sentences_data(entities, sentences_data, skip_entity_func=None):
        """ Compose BratSentence instances from already-parsed sentence data.

            entities: EntityCollection
                entities of the related document, distributed across
                the sentences by _parse_entities.
            sentences_data: list
                json-like dicts, each providing the "text" and "ind_begin"
                keys for one sentence (the format produced by
                _parse_sentences).
            skip_entity_func: callable or None
                optional predicate used by _parse_entities to filter out
                particular entities.

            returns: list of BratSentence, one per entry of sentences_data.
        """
        assert(isinstance(entities, EntityCollection))

        sentence_entities = BratDocumentSentencesReader._parse_entities(
            sentences_data=sentences_data,
            entities=entities,
            skip_entity_func=skip_entity_func)

        # region private methods
        # Convert all the content to brat sentences.
        brat_sentences = []
        for s_ind, s_dict in enumerate(sentences_data):
            brat_sentence = BratSentence(text=s_dict["text"],
                                         index_begin=s_dict["ind_begin"],
                                         entities=sentence_entities[s_ind])
            brat_sentences.append(brat_sentence)

        return brat_sentences

@staticmethod
def __is_sentence_contains(sentence_data, entity):
Expand All @@ -40,7 +55,7 @@ def __is_sentence_contains(sentence_data, entity):
entity.IndexEnd <= sentence_data["ind_end"]

@staticmethod
def __parse_entities(sentences_data, entities, skip_entity_func):
def _parse_entities(sentences_data, entities, skip_entity_func):
""" Sentences is a list of json-like data (dictionaries).
"""
assert(isinstance(sentences_data, list))
Expand Down Expand Up @@ -91,7 +106,7 @@ def __parse_entities(sentences_data, entities, skip_entity_func):
return entities_in_sentences

@staticmethod
def __parse_sentences(input_file, line_handler):
def _parse_sentences(input_file, line_handler):
assert(callable(line_handler) or line_handler is None)
sentences = []
line_start = 0
Expand Down
45 changes: 45 additions & 0 deletions tests/contrib/source/test_brat_compound_entities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import unittest

from arekit.common.bound import Bound
from arekit.common.entities.collection import EntityCollection
from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
from arekit.contrib.source.brat.entities.entity import BratEntity
from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection


class TestCompoundEntites(unittest.TestCase):
    """ Checks that entities — including an overlapping (compound) span —
        are mapped onto sentence-local bounds by
        BratDocumentSentencesReader.from_sentences_data.
    """

    text = "мама мыла раму"
    # Fixture entities over `text`; the last one overlaps the first two.
    # Each entity carries a unique id_in_doc (the original fixture reused
    # "T3" twice — a copy-paste bug, fixed to "T4" below).
    entities = [
        BratEntity(id_in_doc="T1", e_type="PERSON", index_begin=0, index_end=4, value="мама"),
        BratEntity(id_in_doc="T2", e_type="VERB", index_begin=5, index_end=9, value="мыла"),
        BratEntity(id_in_doc="T3", e_type="OBJECT", index_begin=10, index_end=14, value="раму"),
        BratEntity(id_in_doc="T4", e_type="ACTION", index_begin=0, index_end=9, value="мама мыла")
    ]

    def test(self):
        # A single sentence covering the whole text.
        s_data = [
            {"text": self.text, "ind_begin": 0, "ind_end": len(self.text)}
        ]

        synonyms = StemmerBasedSynonymCollection(
            iter_group_values_lists=[], stemmer=MystemWrapper(), is_read_only=False, debug=False)

        collection = EntityCollection(
            self.entities,
            value_to_group_id_func=lambda value:
            SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(synonyms, value))

        sentences = BratDocumentSentencesReader.from_sentences_data(entities=collection,
                                                                    sentences_data=s_data)

        for sentence in sentences:
            for e, b in sentence.iter_entity_with_local_bounds():
                # Use unittest's API: a bare `assert` is stripped under `python -O`.
                self.assertIsInstance(b, Bound)
                print(e.Value, b.Position, b.Position + b.Length)


if __name__ == '__main__':
    unittest.main()

0 comments on commit a35d04a

Please sign in to comment.