Skip to content

Commit

Permalink
#266, task 2, done
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Dec 16, 2021
1 parent a98021b commit 0501d6e
Show file tree
Hide file tree
Showing 9 changed files with 79 additions and 55 deletions.
4 changes: 1 addition & 3 deletions arekit/contrib/experiment_rusentrel/common.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
from arekit.common.entities.base import Entity
from arekit.common.experiment.api.ctx_serialization import SerializationData
from arekit.common.frames.variants.collection import FrameVariantsCollection
from arekit.common.synonyms import SynonymsCollection
from arekit.common.text.parser import BaseTextParser
from arekit.common.text.pipeline_item import TextParserPipelineItem
from arekit.common.text.stemmer import Stemmer
from arekit.processing.text.pipeline_frames import LemmasBasedFrameVariantsParser
from arekit.processing.text.pipeline_frames_lemmatized import LemmasBasedFrameVariantsParser
from arekit.processing.text.pipeline_tokenizer import DefaultTextTokenizer


Expand Down
41 changes: 9 additions & 32 deletions arekit/processing/text/pipeline_frames.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,20 @@
from arekit.common.languages.ru.mods import RussianLanguageMods
from arekit.common.text.pipeline_ctx import PipelineContext
from arekit.common.text.pipeline_item import TextParserPipelineItem
from arekit.common.text.stemmer import Stemmer


class LemmasBasedFrameVariantsParser(TextParserPipelineItem):
class FrameVariantsParser(TextParserPipelineItem):

def __init__(self, frame_variants, stemmer, locale_mods=RussianLanguageMods, save_lemmas=False):
def __init__(self, frame_variants, locale_mods=RussianLanguageMods):
assert(isinstance(frame_variants, FrameVariantsCollection))
assert(len(frame_variants) > 0)
assert(issubclass(locale_mods, BaseLanguageMods))
assert(isinstance(stemmer, Stemmer))
assert(isinstance(save_lemmas, bool))

super(LemmasBasedFrameVariantsParser, self).__init__()
super(FrameVariantsParser, self).__init__()

self.__frame_variants = frame_variants
self.__stemmer = stemmer
self.__locale_mods = locale_mods
self.__save_lemmas = save_lemmas
self.__max_variant_len = max([len(variant) for _, variant in frame_variants.iter_variants()])
self._locale_mods = locale_mods

# region private methods

Expand All @@ -37,26 +32,12 @@ def __check_all_terms_within(terms, start_index, last_index):
def __get_preposition(terms, index):
return terms[index-1] if index > 0 else None

def __lemmatize_term(self, term):
# we first split onto words for lemmatization and then join all of them.
lemma = "".join(self.__stemmer.lemmatize_to_list(term))
# then we replace certain chars according to the locale restrictions.
return self.__locale_mods.replace_specific_word_chars(lemma)

def __lemmatize_terms(self, terms):
"""
Compose a list of lemmatized versions of parsed_news
PS: Might be significantly slow, depending on stemmer were used.
"""
assert(isinstance(terms, list))
return [self.__lemmatize_term(term) if isinstance(term, str) else term for term in terms]

def __try_compose_frame_variant(self, lemmas, start_ind, last_ind):

if last_ind >= len(lemmas):
return None

is_all_words_within = LemmasBasedFrameVariantsParser.__check_all_terms_within(
is_all_words_within = self.__check_all_terms_within(
terms=lemmas,
start_index=start_ind,
last_index=last_ind)
Expand All @@ -71,7 +52,7 @@ def __try_compose_frame_variant(self, lemmas, start_ind, last_ind):

return ctx_value

def __iter_processed(self, terms, origin):
def _iter_processed(self, terms, origin):
assert(len(terms) == len(origin))

start_ind = 0
Expand All @@ -92,13 +73,12 @@ def __iter_processed(self, terms, origin):
if ctx_value is None:
continue

prep_term = LemmasBasedFrameVariantsParser.__get_preposition(terms=terms,
index=start_ind)
prep_term = self.__get_preposition(terms=terms, index=start_ind)

yield TextFrameVariant(
variant=self.__frame_variants.get_variant_by_value(ctx_value),
start_index=start_ind,
is_inverted=self.__locale_mods.is_negation_word(prep_term) if prep_term is not None else False)
is_inverted=self._locale_mods.is_negation_word(prep_term) if prep_term is not None else False)

found = True

Expand All @@ -116,10 +96,7 @@ def apply(self, pipeline_ctx):

# extract terms.
terms = pipeline_ctx.provide("src")
lemmas = self.__lemmatize_terms(terms)

processed_it = self.__iter_processed(terms=lemmas,
origin=lemmas if self.__save_lemmas else terms)
processed_it = self._iter_processed(terms=terms, origin=terms)

# update the result.
pipeline_ctx.update("src", value=list(processed_it))
46 changes: 46 additions & 0 deletions arekit/processing/text/pipeline_frames_lemmatized.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from arekit.common.languages.ru.mods import RussianLanguageMods
from arekit.common.text.pipeline_ctx import PipelineContext
from arekit.common.text.stemmer import Stemmer
from arekit.processing.text.pipeline_frames import FrameVariantsParser


class LemmasBasedFrameVariantsParser(FrameVariantsParser):
    """ Frame-variants text-parser pipeline item that matches frame variants
        against *lemmatized* terms rather than the raw ones.

        Extends FrameVariantsParser by lemmatizing every string term (via the
        provided `stemmer`) before delegating matching to `_iter_processed`.
    """

    def __init__(self, frame_variants, stemmer, locale_mods=RussianLanguageMods, save_lemmas=False):
        """ frame_variants: FrameVariantsCollection
                collection of frame variants to be matched (validated in the base class).
            stemmer: Stemmer
                lemmatizer applied to every string term before matching.
            locale_mods: BaseLanguageMods subclass
                language-specific character/negation handling; defaults to Russian.
            save_lemmas: bool
                when True, the pipeline output keeps the lemmatized terms;
                when False, the original (non-lemmatized) terms are kept.
        """
        assert(isinstance(stemmer, Stemmer))
        assert(isinstance(save_lemmas, bool))

        super(LemmasBasedFrameVariantsParser, self).__init__(locale_mods=locale_mods,
                                                             frame_variants=frame_variants)

        # NOTE(review): `__frame_variants` and `__max_variant_len` are assigned
        # here but not referenced by any method of this subclass; because of
        # name mangling they also cannot be read by the base class. They look
        # like leftovers of the split from FrameVariantsParser — confirm
        # whether the base class still needs its own copies before removing.
        self.__frame_variants = frame_variants
        self.__stemmer = stemmer
        self.__save_lemmas = save_lemmas
        self.__max_variant_len = max([len(variant) for _, variant in frame_variants.iter_variants()])

    def __lemmatize_term(self, term):
        """ Return the lemmatized form of a single term, normalized by the locale. """
        # we first split onto words for lemmatization and then join all of them.
        lemma = "".join(self.__stemmer.lemmatize_to_list(term))
        # then we replace certain chars according to the locale restrictions.
        return self._locale_mods.replace_specific_word_chars(lemma)

    def __lemmatize_terms(self, terms):
        """
        Compose a list in which every string term is replaced by its lemma;
        non-string terms (e.g. entities, tokens) are passed through unchanged.
        PS: Might be significantly slow, depending on the stemmer being used.
        """
        assert(isinstance(terms, list))
        return [self.__lemmatize_term(term) if isinstance(term, str) else term for term in terms]

    def apply(self, pipeline_ctx):
        """ Pipeline entry point: reads the term list from the "src" key of
            `pipeline_ctx`, matches frame variants over the lemmatized terms,
            and writes the processed term list back into "src".
        """
        assert(isinstance(pipeline_ctx, PipelineContext))

        # extract terms.
        terms = pipeline_ctx.provide("src")
        lemmas = self.__lemmatize_terms(terms)

        # Matching always runs over lemmas; `save_lemmas` only controls which
        # term representation (lemmatized or original) survives in the output.
        processed_it = self._iter_processed(terms=lemmas,
                                            origin=lemmas if self.__save_lemmas else terms)

        # update the result.
        pipeline_ctx.update("src", value=list(processed_it))
2 changes: 1 addition & 1 deletion examples/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from arekit.contrib.source.rusentiframes.types import RuSentiFramesVersions
from arekit.contrib.source.rusentrel.io_utils import RuSentRelVersions
from arekit.processing.lemmatization.mystem import MystemWrapper
from arekit.processing.text.pipeline_frames import LemmasBasedFrameVariantsParser
from arekit.processing.text.pipeline_frames_lemmatized import LemmasBasedFrameVariantsParser
from arekit.processing.text.pipeline_tokenizer import DefaultTextTokenizer
from examples.input import EXAMPLES
from examples.network.utils import SingleDocOperations, CustomOpinionOperations, CustomSerializationData, \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import unittest
from pymystem3 import Mystem


sys.path.append('../../../../')

from tests.text.utils import terms_to_str
Expand All @@ -28,8 +29,8 @@
from arekit.processing.pos.mystem_wrap import POSMystemWrapper
from arekit.processing.lemmatization.mystem import MystemWrapper
from arekit.processing.text.token import Token
from arekit.processing.text.pipeline_frames import LemmasBasedFrameVariantsParser
from arekit.processing.text.pipeline_tokenizer import DefaultTextTokenizer
from arekit.processing.text.pipeline_frames_lemmatized import LemmasBasedFrameVariantsParser


class TestRuSentRelOpinionsIter(unittest.TestCase):
Expand Down
6 changes: 3 additions & 3 deletions tests/contrib/networks/test_tf_input_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
import numpy as np
from pymystem3 import Mystem

from arekit.processing.text.pipeline_frames import LemmasBasedFrameVariantsParser

sys.path.append('../../../')

from tests.contrib.networks.text.news import init_rusentrel_doc
Expand All @@ -29,6 +27,7 @@
from arekit.processing.lemmatization.mystem import MystemWrapper
from arekit.processing.pos.mystem_wrap import POSMystemWrapper
from arekit.processing.text.pipeline_tokenizer import DefaultTextTokenizer
from arekit.processing.text.pipeline_frames_lemmatized import LemmasBasedFrameVariantsParser


class TestTfInputFeatures(unittest.TestCase):
Expand Down Expand Up @@ -58,7 +57,8 @@ def test(self):
RuSentRelTextEntitiesParser(),
DefaultTextTokenizer(keep_tokens=True),
LemmasBasedFrameVariantsParser(frame_variants=self.unique_frame_variants,
stemmer=self.stemmer)
stemmer=self.stemmer,
save_lemmas=True)
])

random.seed(10)
Expand Down
2 changes: 0 additions & 2 deletions tests/contrib/source/test_ruattitudes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import unittest
from tqdm import tqdm

from arekit.processing.text.pipeline_frames import LemmasBasedFrameVariantsParser

sys.path.append('../../../../')

from arekit.common.opinions.base import Opinion
Expand Down
25 changes: 14 additions & 11 deletions tests/processing/test_frames_annotation.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,22 @@
from arekit.common.frames.text_variant import TextFrameVariant
from arekit.common.text.pipeline_ctx import PipelineContext
from arekit.processing.lemmatization.mystem import MystemWrapper
from arekit.processing.text.pipeline_frames import LemmasBasedFrameVariantsParser
from arekit.processing.text.pipeline_frames_lemmatized import LemmasBasedFrameVariantsParser
from examples.repository import create_frame_variants_collection

frame_variants_collection = create_frame_variants_collection()
stemmer = MystemWrapper()
p = LemmasBasedFrameVariantsParser(save_lemmas=False,
stemmer=stemmer,
frame_variants=frame_variants_collection)

ctx = PipelineContext(d={"src": "мы пытались его осудить но не получилось".split()})
if __name__ == '__main__':

p.apply(ctx)
frame_variants_collection = create_frame_variants_collection()
stemmer = MystemWrapper()
p = LemmasBasedFrameVariantsParser(save_lemmas=False,
stemmer=stemmer,
frame_variants=frame_variants_collection)

for t in ctx.provide("src"):
s = "[{}]".format(t.Variant.get_value()) if isinstance(t, TextFrameVariant) else t
print(s)
ctx = PipelineContext(d={"src": "мы пытались его осудить но не получилось".split()})

p.apply(ctx)

for t in ctx.provide("src"):
s = "[{}]".format(t.Variant.get_value()) if isinstance(t, TextFrameVariant) else t
print(s)
5 changes: 3 additions & 2 deletions tests/processing/test_text_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from arekit.common.news.parser import NewsParser
from arekit.common.text.parser import BaseTextParser
from arekit.contrib.source.rusentrel.entities.parser import RuSentRelTextEntitiesParser
from arekit.processing.text.pipeline_frames_lemmatized import LemmasBasedFrameVariantsParser

from arekit.processing.text.pipeline_frames import LemmasBasedFrameVariantsParser
from arekit.processing.text.pipeline_tokenizer import DefaultTextTokenizer
from tests.processing.text.debug_text import debug_show_news_terms

Expand Down Expand Up @@ -41,7 +41,8 @@ def test_parsing(self):
text_parser = BaseTextParser(pipeline=[RuSentRelTextEntitiesParser(),
DefaultTextTokenizer(keep_tokens=True),
LemmasBasedFrameVariantsParser(frame_variants=frame_variants,
stemmer=stemmer)])
stemmer=stemmer,
save_lemmas=False)])

# Reading synonyms collection.
synonyms = RuSentRelSynonymsCollectionProvider.load_collection(stemmer=stemmer)
Expand Down

0 comments on commit 0501d6e

Please sign in to comment.