Skip to content

Commit

Permalink
#473 framework generalization refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed May 15, 2023
1 parent 9d88605 commit f4ed4d6
Show file tree
Hide file tree
Showing 9 changed files with 43 additions and 38 deletions.
9 changes: 9 additions & 0 deletions arekit/common/data/input/providers/contents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

class ContentsProvider(object):
    """ Base class for providers of the contents that are sampled further
        into the output.
    """

    def from_doc_ids(self, doc_ids, idle_mode=False):
        """ Main method; every subclass is expected to implement it.

            doc_ids: identifiers of the documents to provide contents for.
            idle_mode: flag passed through by callers; its meaning is
                defined by the implementation.

            NOTE(review): callers appear to iterate over the returned value,
            so implementations presumably return an iterable -- confirm
            against the concrete providers.

            Raises NotImplementedError when the subclass does not override
            this method, instead of silently returning None.
        """
        raise NotImplementedError()
22 changes: 10 additions & 12 deletions arekit/common/data/input/providers/rows/base.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import collections
import logging

from arekit.common.data.input.providers.opinions import InputTextOpinionProvider
from arekit.common.linkage.text_opinions import TextOpinionsLinkage
from arekit.common.data.input.providers.contents import ContentsProvider
from arekit.common.linkage.base import LinkedDataWrapper
from arekit.common.news.parsed.providers.entity_service import EntityServiceProvider
from arekit.common.news.parsed.service import ParsedNewsService

Expand All @@ -22,22 +22,20 @@ def _provide_rows(self, parsed_news, entity_service, text_opinion_linkage, idle_

# endregion

# TODO. Limitation, this is now focused on linked text opinions only!
# TODO. In general, this might be any provider, not only opinion-related.
def iter_by_rows(self, opinion_provider, doc_ids_iter, idle_mode):
assert(isinstance(opinion_provider, InputTextOpinionProvider))
def iter_by_rows(self, contents_provider, doc_ids_iter, idle_mode):
assert(isinstance(contents_provider, ContentsProvider))
assert(isinstance(doc_ids_iter, collections.Iterable))

for linkage in opinion_provider.iter_linked_opinions(doc_ids=doc_ids_iter, idle_mode=idle_mode):
assert(isinstance(linkage, TextOpinionsLinkage))
assert(isinstance(linkage.Tag, ParsedNewsService))
for linked_data in contents_provider.from_doc_ids(doc_ids=doc_ids_iter, idle_mode=idle_mode):
assert(isinstance(linked_data, LinkedDataWrapper))
assert(isinstance(linked_data.Tag, ParsedNewsService))

parsed_news_service = linkage.Tag
parsed_news_service = linked_data.Tag

rows_it = self._provide_rows(parsed_news=parsed_news_service.ParsedNews,
entity_service=parsed_news_service.get_provider(EntityServiceProvider.NAME),
text_opinion_linkage=linkage,
text_opinion_linkage=linked_data,
idle_mode=idle_mode)

for row in rows_it:
yield linkage.RelatedDocID, row
yield linked_data.RelatedDocID, row
10 changes: 4 additions & 6 deletions arekit/common/data/input/repositories/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
from arekit.common.data.input.providers.opinions import InputTextOpinionProvider
from arekit.common.data.input.providers.contents import ContentsProvider
from arekit.common.data.input.providers.rows.base import BaseRowProvider
from arekit.common.data.storages.base import BaseRowsStorage
from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
Expand Down Expand Up @@ -31,18 +31,16 @@ def _setup_rows_provider(self):

# endregion

# TODO. Generalize, TextOpinion -> Any provider.
def populate(self, opinion_provider, doc_ids, desc="", writer=None, target=None):
# TODO. Generalize, TextOpinion -> Any provider.
assert(isinstance(opinion_provider, InputTextOpinionProvider))
def populate(self, contents_provider, doc_ids, desc="", writer=None, target=None):
assert(isinstance(contents_provider, ContentsProvider))
assert(isinstance(self._storage, BaseRowsStorage))
assert(isinstance(doc_ids, list))
assert(isinstance(writer, BaseWriter) or writer is None)
assert(isinstance(target, str) or target is None)

def iter_rows(idle_mode):
return self._rows_provider.iter_by_rows(
opinion_provider=opinion_provider,
contents_provider=contents_provider,
doc_ids_iter=doc_ids,
idle_mode=idle_mode)

Expand Down
14 changes: 14 additions & 0 deletions arekit/common/linkage/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,25 @@ class LinkedDataWrapper(object):
def __init__(self, linked_data):
    """ Keep a private list copy of the linked items and start with no tag.

        linked_data: any iterable; it is consumed once by list().
    """
    # Function-scope import: the `collections.Iterable` alias was removed in
    # Python 3.10, the ABC is only reachable through `collections.abc`.
    from collections.abc import Iterable

    assert(isinstance(linked_data, Iterable))
    self.__linked_data = list(linked_data)
    # No tag is attached until set_tag() is called.
    self.__tag = None

@property
def RelatedDocID(self):
    """ Identifier of the single document the linked data is limited to.
        Not provided at this level: accessing it here always raises
        NotImplementedError.
    """
    raise NotImplementedError()

@property
def First(self):
    """ The element obtained by indexing this object at position 0.
    """
    head = self[0]
    return head

@property
def Tag(self):
    """ The value currently kept in the private tag attribute.
    """
    current = self.__tag
    return current

def set_tag(self, value):
    """ Replace the private tag attribute with the given value.

        value: stored as is, without any validation.
    """
    self.__tag = value

def _get_data_label(self, item):
    """ Hook that receives a single item; not provided at this level, so
        calling it here always raises NotImplementedError.
    """
    raise NotImplementedError()

Expand Down
11 changes: 0 additions & 11 deletions arekit/common/linkage/text_opinions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,12 @@

class TextOpinionsLinkage(LinkedDataWrapper):

def __init__(self, text_opinions_it):
super(TextOpinionsLinkage, self).__init__(linked_data=text_opinions_it)
self.__tag = None

def set_tag(self, value):
self.__tag = value

@property
def First(self):
first = super(TextOpinionsLinkage, self).First
assert(isinstance(first, TextOpinion))
return first

@property
def Tag(self):
return self.__tag

@property
def RelatedDocID(self):
return self.First.DocID
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from arekit.common.data.input.providers.const import IDLE_MODE
from arekit.common.data.input.providers.contents import ContentsProvider
from arekit.common.linkage.text_opinions import TextOpinionsLinkage
from arekit.common.pipeline.base import BasePipeline
from arekit.common.text_opinions.base import TextOpinion


# TODO. Add base class. This should be nested from the ContentsProvider.
# TODO. InputTextOpinionProvider -> ContentsProvider. (Add the latter)
class InputTextOpinionProvider(object):
class InputTextOpinionProvider(ContentsProvider):

def __init__(self, pipeline):
""" NOTE: it is important that the output of the pipeline
Expand All @@ -28,8 +27,7 @@ def __assign_ids(self, linkage):
text_opinion.set_text_opinion_id(self.__current_id)
self.__current_id += 1

# TODO. rename.
def iter_linked_opinions(self, doc_ids, idle_mode=False):
def from_doc_ids(self, doc_ids, idle_mode=False):
self.__current_id = 0
for linkage in self.__pipeline.run(doc_ids, params_dict={IDLE_MODE: idle_mode}):
assert(isinstance(linkage, TextOpinionsLinkage))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def create_text_opinion_extraction_pipeline(rusentrel_version,
dist_in_sentences=0):
""" Processing pipeline for RuSentRel, which combines:
- predefined document-level annotation (sentiment labels)
- automatic annotation of optinions between mentioned named entities (no-label)
- automatic annotation of opinions between mentioned named entities (no-label)
Original collection paper: arxiv.org/abs/1808.08932
Expand Down
5 changes: 2 additions & 3 deletions arekit/contrib/utils/serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,13 @@
from arekit.common.data import const

from arekit.common.data.input.providers.columns.sample import SampleColumnsProvider
from arekit.common.data.input.providers.opinions import InputTextOpinionProvider
from arekit.common.data.input.providers.rows.base import BaseRowProvider
from arekit.common.data.input.repositories.base import BaseInputRepository
from arekit.common.data.input.repositories.sample import BaseInputSamplesRepository
from arekit.common.data.storages.base import BaseRowsStorage
from arekit.common.pipeline.base import BasePipeline
from arekit.contrib.utils.data.contents.opinions import InputTextOpinionProvider
from arekit.contrib.utils.data.service.balance import StorageBalancing
from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
Expand All @@ -38,7 +37,7 @@ def fill_and_write(pipeline, repo, target, writer, doc_ids_iter, desc="", do_bal

doc_ids = list(doc_ids_iter)

repo.populate(opinion_provider=InputTextOpinionProvider(pipeline),
repo.populate(contents_provider=InputTextOpinionProvider(pipeline),
doc_ids=doc_ids,
desc=desc,
writer=writer,
Expand Down

0 comments on commit f4ed4d6

Please sign in to comment.