Skip to content

Commit

Permalink
#224. Huge refactoring. Now frames are not supported as a part of the…
Browse files Browse the repository at this point in the history
… text processing pipeline
  • Loading branch information
nicolay-r committed Dec 15, 2021
1 parent a38c182 commit 9852499
Show file tree
Hide file tree
Showing 22 changed files with 196 additions and 185 deletions.
19 changes: 3 additions & 16 deletions arekit/common/news/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,26 +24,13 @@ def SentencesCount(self):

# endregion

# region protected methods

@staticmethod
def _sentence_to_terms_list_core(sentence):
    """ Apply the text-processing pipeline towards the particular sentence.

    Must be implemented by concrete News subclasses; expected to return
    the sentence contents converted into a list of terms.
    """
    raise NotImplementedError()

# endregion

def sentence_to_terms_list(self, sent_ind):
    """ Convert the sentence at index `sent_ind` into a list of terms. """
    assert(isinstance(sent_ind, int))
    # Fetch the sentence and delegate the actual conversion to the
    # subclass-specific implementation.
    sentence = self._sentences[sent_ind]
    return self._sentence_to_terms_list_core(sentence)

def iter_sentences(self):
    """ Iterate over all sentences of the news in their stored order. """
    for sentence in self._sentences:
        yield sentence

def get_sentence(self, s_ind):
    # Direct positional access to a single sentence by its index.
    return self._sentences[s_ind]

def extract_linked_text_opinions(self, opinion):
"""
opinions: iterable Opinion
Expand Down
15 changes: 10 additions & 5 deletions arekit/common/news/objects_parser.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
from arekit.common.bound import Bound
from arekit.common.news.sentence import BaseNewsSentence
from arekit.common.text.pipeline_ctx import PipelineContext
from arekit.common.text.pipeline_item import TextParserPipelineItem


class BaseObjectsParser(object):
class SentenceObjectsParserPipelineItem(TextParserPipelineItem):

def __init__(self, iter_objs_func):
    """ iter_objs_func: callable
        presumably yields objects (with bounds) found in a sentence;
        exact contract not visible here — TODO confirm against callers.
    """
    assert(callable(iter_objs_func))
    self.__iter_objs_func = iter_objs_func

def parse(self, sentence):
assert(isinstance(sentence, BaseNewsSentence))
def apply(self, pipeline_ctx):
assert(isinstance(pipeline_ctx, PipelineContext))
assert("sentence" in pipeline_ctx)

sentence = pipeline_ctx.provide("sentence")

start = 0
entries = []
Expand All @@ -35,7 +39,8 @@ def parse(self, sentence):
last_part = sentence.Text[start:len(sentence.Text)]
entries.extend(last_part)

return entries
# update information in pipeline
pipeline_ctx.update("src", entries)

def __enter__(self):
return self
Expand Down
18 changes: 17 additions & 1 deletion arekit/common/news/parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from arekit.common.news.base import News
from arekit.common.news.parsed.base import ParsedNews
from arekit.common.text.parser import BaseTextParser
from arekit.common.text.pipeline_ctx import PipelineContext


class NewsParser(object):
Expand All @@ -10,8 +11,23 @@ def parse(news, text_parser):
assert(isinstance(news, News))
assert(isinstance(text_parser, BaseTextParser))

parsed_sentences = [text_parser.parse(news.sentence_to_terms_list(sent_ind))
parsed_sentences = [text_parser.parse(NewsParser.__create_pipeline_ctx(news, sent_ind))
for sent_ind in range(news.SentencesCount)]

return ParsedNews(doc_id=news.ID,
parsed_sentences=parsed_sentences)

@staticmethod
def __create_pipeline_ctx(news, sent_ind):
    """ Compose the default pipeline context for a single sentence.

    The context carries the raw sentence text as the processing source
    plus sentence/document identifiers kept as metadata.
    """
    assert(isinstance(news, News))

    sent = news.get_sentence(sent_ind)

    ctx_params = {}
    ctx_params["src"] = sent.Text        # source data.
    ctx_params["s_ind"] = sent_ind       # sentence index. (as Metadata)
    ctx_params["doc_id"] = news.ID       # document index. (as Metadata)
    ctx_params["sentence"] = sent

    return PipelineContext(d=ctx_params)
7 changes: 1 addition & 6 deletions arekit/common/text/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@
# TODO. All inherited types provide the same values for __init__.
class TextParseOptions(object):

def __init__(self, frame_variants_collection, stemmer, keep_tokens=True):
def __init__(self, frame_variants_collection, stemmer):
assert(isinstance(stemmer, Stemmer) or stemmer is None)
assert(isinstance(frame_variants_collection, FrameVariantsCollection) or frame_variants_collection is None)
assert(isinstance(keep_tokens, bool))
self.__frame_variants_collection = frame_variants_collection
self.__stemmer = stemmer
self.__keep_tokens = keep_tokens

# TODO. This class is weird.
# TODO. As this parameter related to the particular text-parser implementation.
Expand All @@ -28,6 +26,3 @@ def ParseFrameVariants(self):
def FrameVariantsCollection(self):
return self.__frame_variants_collection

@property
def KeepTokens(self):
return self.__keep_tokens
74 changes: 17 additions & 57 deletions arekit/common/text/parser.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,51 @@
import collections

from arekit.common.entities.base import Entity
from arekit.common.frames.variants.collection import FrameVariantsCollection

from arekit.common.languages.mods import BaseLanguageMods
from arekit.common.languages.ru.mods import RussianLanguageMods
from arekit.common.text.options import TextParseOptions
from arekit.common.text.parsed import BaseParsedText
from arekit.common.text.pipeline_ctx import PipelineContext
from arekit.common.text.pipeline_item import TextParserPipelineItem
from arekit.processing.frames.parser import FrameVariantsParser
from arekit.processing.text.enums import TermFormat


class BaseTextParser(object):

def __init__(self, parse_options, create_parsed_text_func=None):
"""
create_parsed_text_func: is a function with the following signature:
(terms, options) -> ParsedText
"""
assert(callable(create_parsed_text_func) or create_parsed_text_func is None)
def __init__(self, parse_options, pipeline):
assert(isinstance(parse_options, TextParseOptions))

if create_parsed_text_func is None:
# default implementation
create_parsed_text_func = lambda terms, _: BaseParsedText(terms=terms)

self.__create_parsed_text_func = create_parsed_text_func
assert(isinstance(pipeline, list))
self._parse_options = parse_options
self.__pipeline = pipeline

def parse(self, terms_list):
def parse(self, pipeline_ctx):
"""
terms_list: list of terms
returns:
ParsedText
"""
assert(isinstance(terms_list, list))
assert(isinstance(pipeline_ctx, PipelineContext))

for item in self.__pipeline:
assert(isinstance(item, TextParserPipelineItem))
item.apply(pipeline_ctx)

# Tokenization stage. (PPL 1).
parsed_text = self.__tokenize_terms(terms_list)
# compose parsed text.
parsed_text = BaseParsedText(terms=pipeline_ctx.provide("src"))

# TODO. In further, this is considered to be departed from base text parser
# TODO. and treated as an (optional) element of the text processing pipeline.
# Frames parsing stage. (PPL 2).
if self._parse_options.ParseFrameVariants:
self.__parse_frame_variants(parsed_text=parsed_text,
frame_variant_collection=self._parse_options.FrameVariantsCollection)

return parsed_text

# region protected abstract

def _parse_to_tokens_list(self, text):
raise NotImplementedError()

# endregion

# region private methods

# TODO. In further, this is considered to be departed from base text parser
# TODO. and treated as an (optional) element of the text processing pipeline.
@staticmethod
def __to_lemmas(locale_mods, parsed_text):
assert(issubclass(locale_mods, BaseLanguageMods))
Expand Down Expand Up @@ -83,35 +74,4 @@ def __parse_frame_variants(parsed_text, frame_variant_collection):
modified_objs=objs_it,
get_obj_bound_func=lambda variant: variant.get_bound())

# TODO. In further, this is considered to be departed from base text parser
# TODO. and treated as an (optional) element of the text processing pipeline.
def __tokenize_terms(self, terms_list):
    """ Tokenize every plain term of `terms_list`.

    Entity instances are kept untouched; every other term is handed to
    self._parse_to_tokens_list (provided by a concrete parser) and
    replaced by the resulting tokens.  The flattened sequence is then
    wrapped via self.__create_parsed_text_func.
    """
    assert(isinstance(terms_list, list))

    handled_terms = self.__handle_terms(
        terms_iter=terms_list,
        skip_term=lambda term: isinstance(term, Entity),
        term_handler=lambda term: self._parse_to_tokens_list(term))

    # Do parsing.
    return self.__create_parsed_text_func(handled_terms,
                                          self._parse_options)

@staticmethod
def __handle_terms(terms_iter, skip_term, term_handler):
    """ Flatten-map over `terms_iter`.

    terms_iter: iterable of terms.
    skip_term: callable(term) -> bool; terms it accepts are kept as-is.
    term_handler: callable(term) -> iterable of replacement terms.

    Returns: list with skipped terms preserved and every other term
    replaced by the items produced by `term_handler`.
    """
    # `collections.Iterable` has been deprecated since Python 3.3 and was
    # removed in Python 3.10 — the canonical location is `collections.abc`.
    assert(isinstance(terms_iter, collections.abc.Iterable))
    assert(callable(skip_term))
    assert(callable(term_handler))

    processed_terms = []
    for term in terms_iter:

        if skip_term(term):
            processed_terms.append(term)
            continue

        new_terms = term_handler(term)
        processed_terms.extend(new_terms)

    return processed_terms

# endregion
16 changes: 16 additions & 0 deletions arekit/common/text/pipeline_ctx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
class PipelineContext(object):
    """ Key-value storage shared between pipeline items.

    Holds the named parameters that text-processing pipeline items
    read from and write back into while a text is being processed.
    """

    def __init__(self, d):
        assert(isinstance(d, dict))
        self._d = d

    def provide(self, param):
        """ Return the value registered under `param`.
            Raises KeyError when the parameter is absent.
        """
        return self._d[param]

    def update(self, param, value):
        """ Register (or overwrite) `value` under the `param` key. """
        self._d[param] = value

    def __contains__(self, item):
        # Supports the `"name" in ctx` membership test.
        return item in self._d
4 changes: 4 additions & 0 deletions arekit/common/text/pipeline_item.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class TextParserPipelineItem(object):
    """ Base class of a single stage within the text-processing pipeline. """

    def apply(self, pipeline_ctx):
        # Subclasses are expected to read their inputs from `pipeline_ctx`
        # and write results back into it; must be overridden.
        raise NotImplementedError()
11 changes: 8 additions & 3 deletions arekit/contrib/experiment_rusentrel/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
from arekit.common.experiment.api.ctx_serialization import SerializationData
from arekit.common.synonyms import SynonymsCollection
from arekit.common.text.options import TextParseOptions
from arekit.processing.text.parser import DefaultTextParser
from arekit.common.text.parser import BaseTextParser
from arekit.common.text.pipeline_item import TextParserPipelineItem
from arekit.processing.text.tokenizer import DefaultTextTokenizer


def entity_to_group_func(entity, synonyms):
Expand All @@ -28,7 +30,8 @@ def entity_to_group_func(entity, synonyms):
return synonyms.get_synonym_group_index(value)


def create_text_parser(exp_data):
def create_text_parser(exp_data, entities_parser):
assert(isinstance(entities_parser, TextParserPipelineItem))

if not isinstance(exp_data, SerializationData):
# We do not utlize text_parser in such case.
Expand All @@ -37,4 +40,6 @@ def create_text_parser(exp_data):
parse_options = TextParseOptions(stemmer=exp_data.Stemmer,
frame_variants_collection=exp_data.FrameVariantCollection)

return DefaultTextParser(parse_options=parse_options)
return BaseTextParser(parse_options=parse_options,
pipeline=[entities_parser,
DefaultTextTokenizer(keep_tokens=True)])
5 changes: 4 additions & 1 deletion arekit/contrib/experiment_rusentrel/exp_ds/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from arekit.contrib.experiment_rusentrel.exp_ds.folding import create_ruattitudes_experiment_data_folding
from arekit.contrib.experiment_rusentrel.exp_ds.opinions import RuAttitudesOpinionOperations
from arekit.contrib.experiment_rusentrel.exp_ds.utils import read_ruattitudes_in_memory
from arekit.contrib.source.ruattitudes.entity.parser import RuAttitudesTextEntitiesParser
from arekit.contrib.source.ruattitudes.io_utils import RuAttitudesVersions

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -43,7 +44,9 @@ def __init__(self, exp_data, experiment_io_type, version, load_docs, extra_name_
self.log_info("Create document operations ... ")
doc_ops = RuAttitudesDocumentOperations(folding=folding,
ru_attitudes=ru_attitudes,
text_parser=create_text_parser(exp_data))
text_parser=create_text_parser(
exp_data=exp_data,
entities_parser=RuAttitudesTextEntitiesParser()))

self.log_info("Create opinion operations ... ")
opin_ops = RuAttitudesOpinionOperations(ru_attitudes=ru_attitudes)
Expand Down
5 changes: 4 additions & 1 deletion arekit/contrib/experiment_rusentrel/exp_joined/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from arekit.contrib.experiment_rusentrel.exp_sl.opinions import RuSentrelOpinionOperations
from arekit.contrib.experiment_rusentrel.synonyms.provider import RuSentRelSynonymsCollectionProvider
from arekit.contrib.source.ruattitudes.io_utils import RuAttitudesVersions
from arekit.contrib.source.rusentrel.entities.parser import RuSentRelTextEntitiesParser
from arekit.contrib.source.rusentrel.io_utils import RuSentRelVersions

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -61,7 +62,9 @@ def __init__(self, exp_data, experiment_io_type, folding_type, ruattitudes_versi
experiment_io=experiment_io)

# init text parser.
text_parser = create_text_parser(self.__exp_data)
# TODO. Limitation, depending on document, entities parser may vary.
text_parser = create_text_parser(exp_data=self.__exp_data,
entities_parser=RuSentRelTextEntitiesParser())

# init documents.
rusentrel_doc = RuSentrelDocumentOperations(version=rusentrel_version,
Expand Down
4 changes: 2 additions & 2 deletions arekit/contrib/source/ruattitudes/entity/parser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from arekit.common.news.objects_parser import BaseObjectsParser
from arekit.common.news.objects_parser import SentenceObjectsParserPipelineItem
from arekit.contrib.source.ruattitudes.text_object import TextObject


class RuAttitudesTextEntitiesParser(BaseObjectsParser):
class RuAttitudesTextEntitiesParser(SentenceObjectsParserPipelineItem):

def __init__(self):
super(RuAttitudesTextEntitiesParser, self).__init__(
Expand Down
6 changes: 0 additions & 6 deletions arekit/contrib/source/ruattitudes/news/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from arekit.common.linked.text_opinions.wrapper import LinkedTextOpinionsWrapper
from arekit.common.news.base import News
from arekit.common.opinions.base import Opinion
from arekit.contrib.source.ruattitudes.entity.parser import RuAttitudesTextEntitiesParser
from arekit.contrib.source.ruattitudes.sentence.base import RuAttitudesSentence


Expand Down Expand Up @@ -58,11 +57,6 @@ def extract_linked_text_opinions(self, opinion):
assert(isinstance(opinion, Opinion))
return LinkedTextOpinionsWrapper(self.__iter_all_text_opinions_in_sentences(opinion=opinion))

@staticmethod
def _sentence_to_terms_list_core(sentence):
    # Parse entity objects within the sentence; the parser is used as a
    # context manager so its resources are released once parsing is done.
    with RuAttitudesTextEntitiesParser() as parser:
        return parser.parse(sentence)

# endregion

# region Private methods
Expand Down
4 changes: 2 additions & 2 deletions arekit/contrib/source/rusentrel/entities/parser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from arekit.common.news.objects_parser import BaseObjectsParser
from arekit.common.news.objects_parser import SentenceObjectsParserPipelineItem
from arekit.contrib.source.rusentrel.sentence import RuSentRelSentence


class RuSentRelTextEntitiesParser(BaseObjectsParser):
class RuSentRelTextEntitiesParser(SentenceObjectsParserPipelineItem):

def __init__(self):
super(RuSentRelTextEntitiesParser, self).__init__(
Expand Down
6 changes: 0 additions & 6 deletions arekit/contrib/source/rusentrel/news/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from arekit.contrib.source.rusentrel.entities.entity import RuSentRelEntity
from arekit.contrib.source.rusentrel.entities.collection import RuSentRelDocumentEntityCollection
from arekit.contrib.source.rusentrel.entities.parser import RuSentRelTextEntitiesParser
from arekit.contrib.source.rusentrel.io_utils import RuSentRelIOUtils, RuSentRelVersions
from arekit.contrib.source.rusentrel.opinions.extraction import iter_text_opinions_by_doc_opinion
from arekit.contrib.source.rusentrel.sentence import RuSentRelSentence
Expand Down Expand Up @@ -126,9 +125,4 @@ def extract_linked_text_opinions(self, opinion):

return LinkedTextOpinionsWrapper(linked_text_opinions=opinions_it)

@staticmethod
def _sentence_to_terms_list_core(sentence):
    # Parse entity objects within the sentence; the parser is used as a
    # context manager so its resources are released once parsing is done.
    with RuSentRelTextEntitiesParser() as parser:
        return parser.parse(sentence)

# endregion
Loading

0 comments on commit 9852499

Please sign in to comment.