-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
79 additions
and
55 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
from arekit.common.languages.ru.mods import RussianLanguageMods | ||
from arekit.common.text.pipeline_ctx import PipelineContext | ||
from arekit.common.text.stemmer import Stemmer | ||
from arekit.processing.text.pipeline_frames import FrameVariantsParser | ||
|
||
|
||
class LemmasBasedFrameVariantsParser(FrameVariantsParser):
    """Frame-variants parser that works on lemmatized versions of the terms.

    String terms taken from the pipeline context are lemmatized with the
    supplied stemmer before being handed over to `_iter_processed`;
    non-string terms are passed through untouched.
    """

    def __init__(self, frame_variants, stemmer, locale_mods=RussianLanguageMods, save_lemmas=False):
        """
        frame_variants: collection providing `iter_variants()`, which yields
            pairs whose second element supports `len()`.
        stemmer: Stemmer instance used to lemmatize string terms.
        locale_mods: forwarded to the base-class initializer.
        save_lemmas: when True, the lemmatized terms (rather than the
            original ones) are passed as `origin` to `_iter_processed`.
        """
        assert isinstance(stemmer, Stemmer)
        assert isinstance(save_lemmas, bool)

        super(LemmasBasedFrameVariantsParser, self).__init__(
            locale_mods=locale_mods,
            frame_variants=frame_variants)

        self.__stemmer = stemmer
        self.__save_lemmas = save_lemmas

        # NOTE(review): the two attributes below are not read anywhere in
        # this class -- confirm whether they are still needed.
        self.__frame_variants = frame_variants
        self.__max_variant_len = max(
            len(variant) for _, variant in frame_variants.iter_variants())

    def __lemmatize_term(self, term):
        """Return the lemmatized, locale-adjusted form of a single term."""
        # the stemmer returns the lemmas as a list; they are concatenated
        # without any separator.
        parts = self.__stemmer.lemmatize_to_list(term)
        joined = "".join(parts)
        # apply the char replacements declared by the locale restrictions.
        return self._locale_mods.replace_specific_word_chars(joined)

    def __lemmatize_terms(self, terms):
        """
        Build a list in which every string term is replaced by its lemma;
        all the other items are kept as they are.
        PS: Might be significantly slow, depending on the stemmer in use.
        """
        assert isinstance(terms, list)

        result = []
        for item in terms:
            if isinstance(item, str):
                result.append(self.__lemmatize_term(item))
            else:
                result.append(item)
        return result

    def apply(self, pipeline_ctx):
        """Replace the "src" value of the context with the processed terms."""
        assert isinstance(pipeline_ctx, PipelineContext)

        # extract terms and compose their lemmatized versions.
        source_terms = pipeline_ctx.provide("src")
        lemmatized = self.__lemmatize_terms(source_terms)

        # choose what is passed as the origin of the processed terms.
        if self.__save_lemmas:
            origin = lemmatized
        else:
            origin = source_terms

        processed = list(self._iter_processed(terms=lemmatized, origin=origin))

        # update the result.
        pipeline_ctx.update("src", value=processed)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,22 @@ | ||
from arekit.common.frames.text_variant import TextFrameVariant | ||
from arekit.common.text.pipeline_ctx import PipelineContext | ||
from arekit.processing.lemmatization.mystem import MystemWrapper | ||
from arekit.processing.text.pipeline_frames import LemmasBasedFrameVariantsParser | ||
from arekit.processing.text.pipeline_frames_lemmatized import LemmasBasedFrameVariantsParser | ||
from examples.repository import create_frame_variants_collection | ||
|
||
# Example: run the lemma-based frame variants parser over a short sentence
# and print every resulting term, with the found frame variants in brackets.
# Everything is kept under the entry-point guard so that importing this
# module neither starts the stemmer nor runs the pipeline.
if __name__ == '__main__':

    frame_variants_collection = create_frame_variants_collection()
    stemmer = MystemWrapper()
    p = LemmasBasedFrameVariantsParser(save_lemmas=False,
                                       stemmer=stemmer,
                                       frame_variants=frame_variants_collection)

    # the parser reads the list of terms from (and writes it back to) "src".
    ctx = PipelineContext(d={"src": "мы пытались его осудить но не получилось".split()})

    p.apply(ctx)

    for t in ctx.provide("src"):
        # frame variants are printed in square brackets, other terms as is.
        s = "[{}]".format(t.Variant.get_value()) if isinstance(t, TextFrameVariant) else t
        print(s)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters