Skip to content

Commit

Permalink
#510 fixed/hack -- using new type of the meta-rows, that optionally se…
Browse files Browse the repository at this point in the history
…rves the meta-information.
  • Loading branch information
nicolay-r committed Sep 16, 2023
1 parent e6ec190 commit eb52e3c
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 22 deletions.
32 changes: 21 additions & 11 deletions arekit/common/data/input/providers/rows/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from arekit.common.data.input.providers.contents import ContentsProvider
from arekit.common.linkage.base import LinkedDataWrapper
from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider
from arekit.common.docs.parsed.service import ParsedDocumentService
from arekit.common.linkage.meta import MetaEmptyLinkedDataWrapper

logger = logging.getLogger(__name__)

Expand All @@ -30,6 +30,13 @@ def _count_row(self):

# endregion

def __iter_rows(self, linked_data, idle_mode):
    """ Provide rows for a single linked-data instance.

        linked_data:
            wrapper whose `Tag` exposes `ParsedDocument` and `get_provider`;
            it is also forwarded to `_provide_rows` as `text_opinion_linkage`.
        idle_mode:
            flag forwarded unchanged to `_provide_rows`.

        returns:
            the result of `_provide_rows` for the given linked data.
    """
    doc_service = linked_data.Tag
    parsed_doc = doc_service.ParsedDocument
    entity_service = doc_service.get_provider(EntityServiceProvider.NAME)
    return self._provide_rows(parsed_doc=parsed_doc,
                              entity_service=entity_service,
                              text_opinion_linkage=linked_data,
                              idle_mode=idle_mode)

def iter_by_rows(self, contents_provider, doc_ids_iter, idle_mode):
assert(isinstance(contents_provider, ContentsProvider))
assert(isinstance(doc_ids_iter, collections.Iterable))
Expand All @@ -38,16 +45,19 @@ def iter_by_rows(self, contents_provider, doc_ids_iter, idle_mode):

for linked_data in contents_provider.from_doc_ids(doc_ids=doc_ids_iter, idle_mode=idle_mode):
assert(isinstance(linked_data, LinkedDataWrapper))
assert(isinstance(linked_data.Tag, ParsedDocumentService))

parsed_doc_service = linked_data.Tag

rows_it = self._provide_rows(parsed_doc=parsed_doc_service.ParsedDocument,
entity_service=parsed_doc_service.get_provider(EntityServiceProvider.NAME),
text_opinion_linkage=linked_data,
idle_mode=idle_mode)

for row in rows_it:
yield linked_data.RelatedDocID, row
if isinstance(linked_data, MetaEmptyLinkedDataWrapper):
if idle_mode:
# In the case of the IDLE mode we do not consider the meta-data.
data_it = []
else:
# Consider the actual linked data instance.
data_it = [linked_data]
else:
# Consider the actual rows of the related linked data.
data_it = self.__iter_rows(linked_data=linked_data, idle_mode=idle_mode)

for data in data_it:
yield linked_data.RelatedDocID, data

self.__rows_counter = None
25 changes: 16 additions & 9 deletions arekit/common/data/storages/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging

from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
from arekit.common.linkage.meta import MetaEmptyLinkedDataWrapper
from arekit.common.utils import progress_bar

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -70,16 +71,25 @@ def fill(self, iter_rows_func, columns_provider, row_handler=None, rows_count=No

doc_ids_seen = set()

for row_index, row in enumerate(pbar_it):
row_index = 0
for row in pbar_it:

doc_id, row_values = row

self._begin_filling_row(row_index)
if isinstance(row_values, MetaEmptyLinkedDataWrapper):
# Do nothing, i.e. do not register the related row.
pass
else:
self._begin_filling_row(row_index)
for column, value in row_values.items():
self._set_row_value(row_ind=row_index,
column=column,
value=value)

for column, value in row_values.items():
self._set_row_value(row_ind=row_index,
column=column,
value=value)
if row_handler is not None:
row_handler()

row_index += 1

# Provide information about amount of processed documents.
doc_ids_seen.add(doc_id)
Expand All @@ -88,9 +98,6 @@ def fill(self, iter_rows_func, columns_provider, row_handler=None, rows_count=No
"doc_now": str(doc_id),
})

if row_handler is not None:
row_handler()

def free(self):
gc.collect()

Expand Down
23 changes: 23 additions & 0 deletions arekit/common/linkage/meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from arekit.common.linkage.base import LinkedDataWrapper


class MetaEmptyLinkedDataWrapper(LinkedDataWrapper):
    """ Placeholder data-wrapper used to pass system information
        while iterating through the data pipelines.

        It wraps an empty list of linked data and only carries
        a document identifier together with optional meta-data.
    """

    def __init__(self, doc_id, meta_data=None):
        """ doc_id:
                identifier exposed through the `RelatedDocID` property.
            meta_data:
                optional payload (None by default) exposed through
                the `MetaData` property for use further down the pipeline.
        """
        # The base wrapper is initialized with an empty list of linked data.
        super().__init__([])
        self.__meta_data = meta_data
        self.__doc_id = doc_id

    @property
    def RelatedDocID(self):
        """ Document identifier passed to the constructor. """
        return self.__doc_id

    @property
    def MetaData(self):
        """ Meta-data passed to the constructor (None when omitted). """
        return self.__meta_data
6 changes: 4 additions & 2 deletions arekit/contrib/utils/data/contents/opinions.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from arekit.common.data.input.providers.const import IDLE_MODE
from arekit.common.data.input.providers.contents import ContentsProvider
from arekit.common.linkage.base import LinkedDataWrapper
from arekit.common.linkage.text_opinions import TextOpinionsLinkage
from arekit.common.pipeline.base import BasePipeline
from arekit.common.text_opinions.base import TextOpinion
Expand Down Expand Up @@ -30,6 +31,7 @@ def __assign_ids(self, linkage):
def from_doc_ids(self, doc_ids, idle_mode=False):
self.__current_id = 0
for linkage in self.__pipeline.run(doc_ids, params_dict={IDLE_MODE: idle_mode}):
assert(isinstance(linkage, TextOpinionsLinkage))
self.__assign_ids(linkage)
assert(isinstance(linkage, LinkedDataWrapper))
if isinstance(linkage, TextOpinionsLinkage):
self.__assign_ids(linkage)
yield linkage
4 changes: 4 additions & 0 deletions arekit/contrib/utils/pipelines/text_opinion/extraction.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from arekit.common.linkage.meta import MetaEmptyLinkedDataWrapper
from arekit.common.linkage.text_opinions import TextOpinionsLinkage
from arekit.common.docs.parsed.base import ParsedDocument
from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider
Expand Down Expand Up @@ -51,6 +52,9 @@ def __to_id(text_opinion):
text_opinion_linkage.set_tag(service)
yield text_opinion_linkage

# Mark the end of the document with an empty meta wrapper.
yield MetaEmptyLinkedDataWrapper(doc_id=parsed_doc.RelatedDocID)


def text_opinion_extraction_pipeline(text_parser, get_doc_by_id_func, annotators, entity_index_func,
text_opinion_filters=None):
Expand Down

0 comments on commit eb52e3c

Please sign in to comment.