From a35d04aafdc33644cd596aff5b1a54df191a228e Mon Sep 17 00:00:00 2001
From: Nicolay Rusnachenko
Date: Fri, 16 Sep 2022 18:07:23 +0300
Subject: [PATCH] #398 related

---
 .../contrib/source/brat/entities/compound.py  | 24 ++++++++++
 .../contrib/source/brat/sentences_reader.py   | 27 ++++++++---
 .../source/test_brat_compound_entities.py     | 45 +++++++++++++++++++
 3 files changed, 90 insertions(+), 6 deletions(-)
 create mode 100644 arekit/contrib/source/brat/entities/compound.py
 create mode 100644 tests/contrib/source/test_brat_compound_entities.py

diff --git a/arekit/contrib/source/brat/entities/compound.py b/arekit/contrib/source/brat/entities/compound.py
new file mode 100644
index 00000000..1a51e1cd
--- /dev/null
+++ b/arekit/contrib/source/brat/entities/compound.py
@@ -0,0 +1,24 @@
+from arekit.contrib.source.brat.entities.entity import BratEntity
+
+
+class BratCompoundEntity(BratEntity):
+    """ Entity which contains a hierarchy of other entities.
+    """
+
+    def __init__(self, id_in_doc, value, e_type, root, entities, index_begin, index_end, group_index=None):
+        assert(isinstance(entities, list))
+        assert(isinstance(root, BratCompoundEntity) or root is None)
+        super(BratCompoundEntity, self).__init__(value=value, e_type=e_type,
+                                                 id_in_doc=id_in_doc,
+                                                 index_begin=index_begin,
+                                                 index_end=index_end,
+                                                 group_index=group_index)
+        self.__entities = entities
+        self.__root = root
+
+    @property
+    def Root(self):
+        return self.__root
+
+    def iter_childs(self):
+        return iter(self.__entities)
diff --git a/arekit/contrib/source/brat/sentences_reader.py b/arekit/contrib/source/brat/sentences_reader.py
index 4faa534d..ade21af7 100644
--- a/arekit/contrib/source/brat/sentences_reader.py
+++ b/arekit/contrib/source/brat/sentences_reader.py
@@ -10,10 +10,10 @@ def from_file(input_file, entities, line_handler=None, skip_entity_func=None):
         assert(isinstance(entities, EntityCollection))
         assert(callable(skip_entity_func) or skip_entity_func is None)
 
-        sentences_data = BratDocumentSentencesReader.__parse_sentences(
+        sentences_data = BratDocumentSentencesReader._parse_sentences(
             input_file=input_file, line_handler=line_handler)
 
-        sentence_entities = BratDocumentSentencesReader.__parse_entities(
+        sentence_entities = BratDocumentSentencesReader._parse_entities(
             sentences_data=sentences_data,
             entities=entities,
             skip_entity_func=skip_entity_func)
@@ -28,9 +28,24 @@ def from_file(input_file, entities, line_handler=None, skip_entity_func=None):
 
         return brat_sentences
 
-    # endregion
+    @staticmethod
+    def from_sentences_data(entities, sentences_data, skip_entity_func=None):
+        assert(isinstance(entities, EntityCollection))
+
+        sentence_entities = BratDocumentSentencesReader._parse_entities(
+            sentences_data=sentences_data,
+            entities=entities,
+            skip_entity_func=skip_entity_func)
 
-    # region private methods
+        # Convert all the content to brat sentences.
+        brat_sentences = []
+        for s_ind, s_dict in enumerate(sentences_data):
+            brat_sentence = BratSentence(text=s_dict["text"],
+                                         index_begin=s_dict["ind_begin"],
+                                         entities=sentence_entities[s_ind])
+            brat_sentences.append(brat_sentence)
+
+        return brat_sentences
 
     @staticmethod
     def __is_sentence_contains(sentence_data, entity):
@@ -40,7 +55,7 @@ def __is_sentence_contains(sentence_data, entity):
                entity.IndexEnd <= sentence_data["ind_end"]
 
     @staticmethod
-    def __parse_entities(sentences_data, entities, skip_entity_func):
+    def _parse_entities(sentences_data, entities, skip_entity_func):
         """ Sentences is a list of json-like data (dictionaries).
         """
         assert(isinstance(sentences_data, list))
@@ -91,7 +106,7 @@ def __parse_entities(sentences_data, entities, skip_entity_func):
         return entities_in_sentences
 
     @staticmethod
-    def __parse_sentences(input_file, line_handler):
+    def _parse_sentences(input_file, line_handler):
         assert(callable(line_handler) or line_handler is None)
         sentences = []
         line_start = 0
diff --git a/tests/contrib/source/test_brat_compound_entities.py b/tests/contrib/source/test_brat_compound_entities.py
new file mode 100644
index 00000000..9084a56f
--- /dev/null
+++ b/tests/contrib/source/test_brat_compound_entities.py
@@ -0,0 +1,45 @@
+import unittest
+
+from arekit.common.bound import Bound
+from arekit.common.entities.collection import EntityCollection
+from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
+from arekit.contrib.source.brat.entities.entity import BratEntity
+from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
+from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
+from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection
+
+
+class TestCompoundEntities(unittest.TestCase):
+
+    text = "мама мыла раму"
+    entities = [
+        BratEntity(id_in_doc="T1", e_type="PERSON", index_begin=0, index_end=4, value="мама"),
+        BratEntity(id_in_doc="T2", e_type="VERB", index_begin=5, index_end=9, value="мыла"),
+        BratEntity(id_in_doc="T3", e_type="OBJECT", index_begin=10, index_end=14, value="раму"),
+        BratEntity(id_in_doc="T4", e_type="ACTION", index_begin=0, index_end=9, value="мама мыла")
+    ]
+
+    def test(self):
+        s_data = [
+            {"text": self.text, "ind_begin": 0, "ind_end": len(self.text)}
+        ]
+
+        synonyms = StemmerBasedSynonymCollection(
+            iter_group_values_lists=[], stemmer=MystemWrapper(), is_read_only=False, debug=False)
+
+        collection = EntityCollection(
+            self.entities,
+            value_to_group_id_func=lambda value:
+            SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(synonyms, value))
+
+        sentences = BratDocumentSentencesReader.from_sentences_data(entities=collection,
+                                                                    sentences_data=s_data)
+
+        for sentence in sentences:
+            for e, b in sentence.iter_entity_with_local_bounds():
+                assert(isinstance(b, Bound))
+                print(e.Value, b.Position, b.Position + b.Length)
+
+
+if __name__ == '__main__':
+    unittest.main()