def modify_parsed_sentences(self, sentence_objs_upd_func, get_obj_bound_func):
    """
    Update every parsed sentence in place with objects produced for it.

    sentence_objs_upd_func: callable
        receives a parsed sentence and returns the objects that should be
        placed into that sentence.
    get_obj_bound_func: callable
        receives one of those objects and returns its bound; it is forwarded
        unchanged to the `modify_by_bounded_objects` method of the sentence.
    """
    assert(callable(sentence_objs_upd_func))
    assert(callable(get_obj_bound_func))

    for parsed_sentence in self.__parsed_sentences:
        assert(isinstance(parsed_sentence, ParsedText))

        replacements = sentence_objs_upd_func(parsed_sentence)
        parsed_sentence.modify_by_bounded_objects(modified_objs=replacements,
                                                  get_obj_bound_func=get_obj_bound_func)

    # Sentences were changed in place, so entity positions are initialized again.
    self.__init_entity_positions()
class FrameVariantsParser(object):
    """
    Searches for frame variants in a list of lemmatized terms.
    """

    # region private methods

    @staticmethod
    def __check_all_words_within(terms, start_index, last_index):
        """
        Return True when every term in [start_index, last_index] is a string.
        """
        return all(isinstance(term, str) for term in terms[start_index:last_index + 1])

    @staticmethod
    def __get_preposition(terms, index):
        """
        Return the term that directly precedes `index`, or None for the first term.
        """
        return terms[index - 1] if index > 0 else None

    # endregion

    @staticmethod
    def iter_frames_from_lemmas(frame_variants, lemmas, locale_mods=RussianLanguageMods):
        """
        Considered to perform frames annotation across lemmatized terms.

        The terms are scanned from left to right. At every position the longest
        window is tried first; the first window whose space-joined value is a
        known variant is reported and the scan continues right after it.

        frame_variants: FrameVariantsCollection
            collection that provides `iter_variants`, `has_variant` and
            `get_variant_by_value`.
        lemmas: list
            terms of a sentence; only windows that consist entirely of `str`
            terms are considered.
        locale_mods: BaseLanguageMods subclass
            provides `is_negation_word`, applied to the term that directly
            precedes a found variant.

        Yields TextFrameVariant instances ordered by start index (ascending).
        Nothing is yielded for an empty variants collection.
        """
        assert(isinstance(frame_variants, FrameVariantsCollection))
        assert(isinstance(lemmas, list))
        assert(issubclass(locale_mods, BaseLanguageMods))

        variant_lengths = [len(variant) for _, variant in frame_variants.iter_variants()]

        # Nothing could be matched (and max() of an empty list raises ValueError).
        if not variant_lengths:
            return

        max_variant_len = max(variant_lengths)

        start_ind = 0
        while start_ind < len(lemmas):

            # Amount of terms covered by the variant found at start_ind (0 -- not found).
            matched_len = 0

            # The window of max_variant_len terms is included, and windows
            # that exceed the end of the list are not considered at all.
            longest_ctx = min(max_variant_len, len(lemmas) - start_ind)

            for ctx_size in range(longest_ctx, 0, -1):

                last_ind = start_ind + ctx_size - 1

                is_all_words_within = FrameVariantsParser.__check_all_words_within(
                    terms=lemmas,
                    start_index=start_ind,
                    last_index=last_ind)

                if not is_all_words_within:
                    continue

                ctx_value = " ".join(lemmas[start_ind:last_ind + 1])

                if not frame_variants.has_variant(ctx_value):
                    continue

                prep_term = FrameVariantsParser.__get_preposition(terms=lemmas,
                                                                  index=start_ind)

                yield TextFrameVariant(
                    variant=frame_variants.get_variant_by_value(ctx_value),
                    start_index=start_ind,
                    is_inverted=locale_mods.is_negation_word(prep_term) if prep_term is not None else False)

                matched_len = ctx_size
                break

            # Always move forward by at least one term: the scan must terminate
            # even when no window has been examined or matched.
            start_ind += max(matched_len, 1)
def __update_lemmatization(self):
    """
    Run lemmatization of the terms when a stemmer has been provided.
    """
    # TODO. leave here.
    if self.__stemmer is not None:
        self.__lemmatize(self.__stemmer)

# TODO. to base as an abstract-like method. (implementation here)
def copy_modified(self, terms):
    """
    Return a new ParsedText built from `terms` with the stemmer of this instance.
    """
    return ParsedText(terms=terms, stemmer=self.__stemmer)

def modify_by_bounded_objects(self, modified_objs, get_obj_bound_func):
    """
    Replace ranges of terms by the given objects, in place.

    modified_objs: iterable or None
        objects to be placed into the terms list, ordered by the position of
        their bounds (ascending). None means that there is nothing to modify.
    get_obj_bound_func: callable
        receives an object and returns its bound, i.e. an instance that
        provides `Position` (index of the first replaced term) and `Length`
        (amount of replaced terms).

    Raises Exception when objects are not ordered by position, and IndexError
    when a bound exceeds the terms list. Both checks are performed before the
    first modification, so terms remain untouched on failure.
    """
    # collections.Iterable alias is not available since Python 3.10.
    from collections.abc import Iterable

    assert(callable(get_obj_bound_func))

    # This check must precede the type assertion, otherwise it is unreachable.
    if modified_objs is None:
        return

    assert(isinstance(modified_objs, Iterable))

    local_terms = self.__terms

    # Bounds are obtained once; the input might be a generator.
    bounded_objs = [(get_obj_bound_func(obj), obj) for obj in modified_objs]

    prev_position = None
    for obj_bound, _ in bounded_objs:

        if prev_position is not None and obj_bound.Position < prev_position:
            raise Exception("objs list has incorrect order. It is expected that instances "
                            "ordered by positions (ascending)")

        if obj_bound.Position + obj_bound.Length > len(local_terms):
            raise IndexError("object bound exceeds the list of terms")

        prev_position = obj_bound.Position

    # Going from the last object to the first one keeps positions of the
    # objects that are not processed yet valid.
    for obj_bound, obj in reversed(bounded_objs):
        del local_terms[obj_bound.Position:obj_bound.Position + obj_bound.Length]
        local_terms.insert(obj_bound.Position, obj)

    self.__update_lemmatization()
@staticmethod
def __to_lemmas(locale_mods, parsed_text):
    """
    Return the lemma-format terms of `parsed_text` as a list.

    String terms are passed through `locale_mods.replace_specific_word_chars`;
    terms of any other type are kept as they are.
    """
    assert(issubclass(locale_mods, BaseLanguageMods))

    normalized_terms = []
    for term in parsed_text.iter_terms(term_format=TermFormat.Lemma):
        if isinstance(term, str):
            term = locale_mods.replace_specific_word_chars(term)
        normalized_terms.append(term)

    return normalized_terms