Skip to content

Commit

Permalink
#218. Refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Nov 30, 2021
1 parent 1e30876 commit af01adb
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 118 deletions.
13 changes: 8 additions & 5 deletions arekit/common/news/parsed/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,13 +124,16 @@ def get_entity_value(self, id_in_document):

# region public 'modify' methods

def modify_parsed_sentences(self, sentence_upd_func):
assert(callable(sentence_upd_func))
def modify_parsed_sentences(self, sentence_objs_upd_func, get_obj_bound_func):
assert(callable(sentence_objs_upd_func))
assert(callable(get_obj_bound_func))

for s_index, sentence in enumerate(self.__parsed_sentences):
updated = sentence_upd_func(sentence)
assert(isinstance(updated, ParsedText))
self.__parsed_sentences[s_index] = updated
assert(isinstance(sentence, ParsedText))

sentence.modify_by_bounded_objects(
modified_objs=sentence_objs_upd_func(sentence),
get_obj_bound_func=get_obj_bound_func)

self.__init_entity_positions()

Expand Down
67 changes: 0 additions & 67 deletions arekit/processing/frames/annot.py

This file was deleted.

86 changes: 45 additions & 41 deletions arekit/processing/frames/parser.py
Original file line number Diff line number Diff line change
@@ -1,63 +1,67 @@
import collections

from arekit.common.frame_variants.collection import FrameVariantsCollection
from arekit.common.languages.ru.mods import RussianLanguageMods
from arekit.common.text_frame_variant import TextFrameVariant
from arekit.common.languages.mods import BaseLanguageMods
from arekit.processing.frames.annot import FrameVariantsAnnotationHelper
from arekit.processing.text.enums import TermFormat
from arekit.processing.text.parsed import ParsedText


class FrameVariantsParser(object):

# region private methods

@staticmethod
def __insert_frame_variants_into_raw_terms_list(raw_terms_list, frame_variants_iter):
assert(isinstance(raw_terms_list, list))
assert(isinstance(frame_variants_iter, collections.Iterable))

def __remove(terms, start, end):
while end > start:
del terms[start]
end -= 1

for variant in reversed(list(frame_variants_iter)):
assert (isinstance(variant, TextFrameVariant))
variant_bound = variant.get_bound()
__remove(terms=raw_terms_list,
start=variant_bound.Position,
end=variant_bound.Position + variant_bound.Length)
raw_terms_list.insert(variant_bound.Position, variant)
def __check_all_words_within(terms, start_index, last_index):
for i in range(start_index, last_index + 1):
if not isinstance(terms[i], str):
return False
return True

return raw_terms_list
@staticmethod
def __get_preposition(terms, index):
    """
    Return the term that directly precedes the given index,
    or None when there is no preceding term (index is not positive).
    """
    if index <= 0:
        return None

    return terms[index - 1]

# endregion

# TODO. #218. There is no need in parsed text here!
@staticmethod
def parse_frames_in_parsed_text(frame_variants_collection, parsed_text, locale_mods=RussianLanguageMods):
assert(isinstance(frame_variants_collection, FrameVariantsCollection))
assert(isinstance(parsed_text, ParsedText))
def iter_frames_from_lemmas(frame_variants, lemmas, locale_mods=RussianLanguageMods):
"""
Considered to perform frames annotation across lemmatized terms.
"""
assert(isinstance(frame_variants, FrameVariantsCollection))
assert(isinstance(lemmas, list))
assert(issubclass(locale_mods, BaseLanguageMods))

# TODO. 218. Move lemmatization outside (reason: this is not a part of the frames annotation logic)
lemmas = [locale_mods.replace_specific_word_chars(lemma) if isinstance(lemma, str) else lemma
for lemma in parsed_text.iter_terms(term_format=TermFormat.Lemma)]
start_ind = 0
last_ind = 0
max_variant_len = max([len(variant) for _, variant in frame_variants.iter_variants()])
while start_ind < len(lemmas):
for ctx_size in reversed(list(range(1, max_variant_len))):

last_ind = start_ind + ctx_size - 1

if not(last_ind < len(lemmas)):
continue

is_all_words_within = FrameVariantsParser.__check_all_words_within(
terms=lemmas,
start_index=start_ind,
last_index=last_ind)

if not is_all_words_within:
continue

ctx_value = " ".join(lemmas[start_ind:last_ind + 1])

if not frame_variants.has_variant(ctx_value):
continue

frame_variants_iter = FrameVariantsAnnotationHelper.iter_frames_from_lemmas(
frame_variants=frame_variants_collection,
lemmas=lemmas,
locale_mods=locale_mods)
prep_term = FrameVariantsParser.__get_preposition(terms=lemmas,
index=start_ind)

if frame_variants_iter is None:
return parsed_text
yield TextFrameVariant(
variant=frame_variants.get_variant_by_value(ctx_value),
start_index=start_ind,
is_inverted=locale_mods.is_negation_word(prep_term) if prep_term is not None else False)

# TODO. 218 Return updated terms only
updated_terms = FrameVariantsParser.__insert_frame_variants_into_raw_terms_list(
raw_terms_list=list(parsed_text.iter_terms(TermFormat.Raw)),
frame_variants_iter=frame_variants_iter)
break

# TODO. Remove parsed text from here.
return parsed_text.copy_modified(terms=updated_terms)
start_ind = last_ind + 1
42 changes: 40 additions & 2 deletions arekit/processing/text/parsed.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import collections

from arekit.processing.lemmatization.base import Stemmer
from arekit.processing.text.enums import TermFormat

Expand Down Expand Up @@ -28,15 +30,51 @@ def __init__(self, terms, stemmer=None):
# TODO. leave here.
self.__stemmer = stemmer

self.__update_lemmatization()

def __update_lemmatization(self):
# TODO. leave here.
if stemmer is not None:
self.__lemmatize(stemmer)
if self.__stemmer is not None:
self.__lemmatize(self.__stemmer)

# TODO. to base as an abstract-like method. (implementation here)
def copy_modified(self, terms):
    """
    Create a new ParsedText instance over the given terms,
    passing the stemmer kept by this instance.
    """
    stemmer = self.__stemmer
    return ParsedText(terms=terms, stemmer=stemmer)

def modify_by_bounded_objects(self, modified_objs, get_obj_bound_func):
    """
    Replace spans of the terms list, in place, by the given objects.

    modified_objs: iterable of objects to be inserted, ordered by the
        position of their bounds (ascending), with non-overlapping bounds;
        None means that there is nothing to modify.
    get_obj_bound_func: callable that returns, for an object, its bound,
        i.e. an instance with `Position` and `Length` attributes.

    Raises Exception when objects are not ordered by position (or overlap),
    and IndexError when a bound does not fit into the terms list.
    """
    # Function-scope import: the `collections.Iterable` alias
    # is not available since Python 3.10.
    from collections.abc import Iterable

    assert(callable(get_obj_bound_func))

    # Has to precede the type assertion, otherwise this branch is unreachable.
    if modified_objs is None:
        return

    assert(isinstance(modified_objs, Iterable))

    local_terms = self.__terms

    # Start position of the previously processed object
    # (the one that goes next in the terms list).
    prev_position = None

    # Going backwards keeps positions of the yet unprocessed objects valid.
    for obj in reversed(list(modified_objs)):
        obj_bound = get_obj_bound_func(obj)
        start = obj_bound.Position
        end = start + obj_bound.Length

        if prev_position is not None and end > prev_position:
            raise Exception("objs list has incorrect order. It is expected that instances "
                            "ordered by positions (ascending)")

        # Slice assignment silently clamps indices, so check the range explicitly.
        if start < 0 or end > len(local_terms):
            raise IndexError("object bound [{}, {}) is out of the terms list range".format(start, end))

        # Substitute the whole bounded span by a single object.
        local_terms[start:end] = [obj]
        prev_position = start

    self.__update_lemmatization()

# endregion

# TODO. move to base.
Expand Down
18 changes: 15 additions & 3 deletions arekit/processing/text/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@

from arekit.common.entities.base import Entity
from arekit.common.frame_variants.collection import FrameVariantsCollection
from arekit.common.languages.mods import BaseLanguageMods
from arekit.common.languages.ru.mods import RussianLanguageMods
from arekit.common.news.base import News
from arekit.common.news.parse_options import NewsParseOptions
from arekit.common.news.parsed.base import ParsedNews
from arekit.common.utils import split_by_whitespaces
from arekit.processing.frames.parser import FrameVariantsParser
from arekit.processing.text.enums import TermFormat
from arekit.processing.text.parsed import ParsedText
from arekit.processing.text.tokens import Tokens
from arekit.processing.text.token import Token
Expand Down Expand Up @@ -45,6 +48,12 @@ def parse_news(news, parse_options):

return parsed_news

@staticmethod
def __to_lemmas(locale_mods, parsed_text):
    """
    Compose a list of the lemma-formatted terms of the parsed text, where
    every string term is passed through `replace_specific_word_chars`
    of the given locale mods; non-string terms are kept as they are.
    """
    assert(issubclass(locale_mods, BaseLanguageMods))

    lemmas = []
    for term in parsed_text.iter_terms(term_format=TermFormat.Lemma):
        if isinstance(term, str):
            term = locale_mods.replace_specific_word_chars(term)
        lemmas.append(term)

    return lemmas

# region private methods

@staticmethod
Expand Down Expand Up @@ -92,9 +101,12 @@ def __parse_frame_variants(parsed_news, frame_variant_collection):
return

parsed_news.modify_parsed_sentences(
lambda sentence: FrameVariantsParser.parse_frames_in_parsed_text(
frame_variants_collection=frame_variant_collection,
parsed_text=sentence))
sentence_objs_upd_func=lambda sentence:
FrameVariantsParser.iter_frames_from_lemmas(
frame_variants=frame_variant_collection,
lemmas=TextParser.__to_lemmas(locale_mods=RussianLanguageMods,
parsed_text=sentence)),
get_obj_bound_func=lambda variant: variant.get_bound())

@staticmethod
def __parse_string_list(terms_iter, skip_term, stemmer=None):
Expand Down

0 comments on commit af01adb

Please sign in to comment.