-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#539 done. Pipelines are now way more flexible. The source can be customized.
#535 related
- Loading branch information
Showing
31 changed files
with
212 additions
and
146 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,24 +1,61 @@ | ||
from arekit.common.docs.base import Document | ||
from arekit.common.docs.parsed.base import ParsedDocument | ||
from arekit.common.pipeline.base import BasePipeline | ||
from arekit.common.pipeline.batching import BatchingPipeline | ||
from arekit.common.pipeline.context import PipelineContext | ||
from arekit.common.text.parser import BaseTextParser | ||
from arekit.common.pipeline.utils import BatchIterator | ||
from arekit.common.text.parsed import BaseParsedText | ||
|
||
|
||
class DocumentParser(object): | ||
class DocumentParsers(object): | ||
|
||
@staticmethod | ||
def __get_sent(doc, sent_ind): | ||
return doc.get_sentence(sent_ind) | ||
def parse(doc, pipeline_items, parent_ppl_ctx=None): | ||
""" This document parser is based on single text parts (sentences) | ||
that passes sequentially through the pipeline of transformations. | ||
""" | ||
assert(isinstance(doc, Document)) | ||
assert(isinstance(pipeline_items, list)) | ||
assert(isinstance(parent_ppl_ctx, PipelineContext) or parent_ppl_ctx is None) | ||
|
||
pipeline = BasePipeline(pipeline_items) | ||
|
||
parsed_sentences = [] | ||
for sent_ind in range(doc.SentencesCount): | ||
|
||
# Composing the context from a single sentence. | ||
ctx = PipelineContext({"input": doc.get_sentence(sent_ind)}, parent_ctx=parent_ppl_ctx) | ||
|
||
# Apply all the operations. | ||
pipeline.run(ctx) | ||
|
||
# Collecting the result. | ||
parsed_sentences.append(BaseParsedText(terms=ctx.provide("result"))) | ||
|
||
return ParsedDocument(doc_id=doc.ID, parsed_sentences=parsed_sentences) | ||
|
||
@staticmethod | ||
def parse(doc, text_parser, parent_ppl_ctx=None): | ||
def parse_batch(doc, pipeline_items, batch_size, parent_ppl_ctx=None): | ||
""" This document parser is based on batch of sentences. | ||
""" | ||
assert(isinstance(batch_size, int) and batch_size > 0) | ||
assert(isinstance(doc, Document)) | ||
assert(isinstance(text_parser, BaseTextParser)) | ||
assert(isinstance(pipeline_items, list)) | ||
assert(isinstance(parent_ppl_ctx, PipelineContext) or parent_ppl_ctx is None) | ||
|
||
parsed_sentences = [text_parser.run(params_dict={"input": DocumentParser.__get_sent(doc, sent_ind)}, | ||
parent_ctx=parent_ppl_ctx) | ||
for sent_ind in range(doc.SentencesCount)] | ||
pipeline = BatchingPipeline(pipeline_items) | ||
|
||
parsed_sentences = [] | ||
for batch in BatchIterator(lst=list(range(doc.SentencesCount)), batch_size=batch_size): | ||
|
||
# Composing the context from a batch of sentences. | ||
ctx = PipelineContext({"input": [doc.get_sentence(s_ind) for s_ind in batch]}, | ||
parent_ctx=parent_ppl_ctx) | ||
|
||
# Apply all the operations. | ||
pipeline.run(ctx) | ||
|
||
# Collecting the result. | ||
parsed_sentences += [BaseParsedText(terms=result) for result in ctx.provide("result")] | ||
|
||
return ParsedDocument(doc_id=doc.ID, | ||
parsed_sentences=parsed_sentences) | ||
return ParsedDocument(doc_id=doc.ID, parsed_sentences=parsed_sentences) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from arekit.common.pipeline.base import BasePipeline | ||
from arekit.common.pipeline.context import PipelineContext | ||
from arekit.common.pipeline.items.base import BasePipelineItem | ||
|
||
|
||
class BatchingPipeline(BasePipeline):
    """ Pipeline whose items are applied to a batch (list) of contents.

        For every non-None pipeline item the source is fetched from the
        context, the item is applied, and the list of outcomes is written back
        into the context under the item's result key.
    """

    def run(self, pipeline_ctx):
        """ Run all the pipeline items over the batch kept in `pipeline_ctx`.

            pipeline_ctx: PipelineContext
                context that provides item sources and receives item results.
            returns: the same `pipeline_ctx` instance, updated in place.
        """
        assert(isinstance(pipeline_ctx, PipelineContext))

        for ppl_item in self._pipeline:

            # Empty slots of the pipeline are skipped.
            if ppl_item is None:
                continue

            assert(isinstance(ppl_item, BasePipelineItem))

            batching_supported = ppl_item.SupportBatching

            if batching_supported:
                # The item consumes the whole batch at once.
                source_batch = ppl_item.get_source(pipeline_ctx)
            else:
                # The item consumes contents one by one, so the source function
                # (if any) is applied to every element of the raw batch here.
                raw_batch = ppl_item.get_source(pipeline_ctx, call_func=False)
                source_batch = []
                for entry in raw_batch:
                    if ppl_item._src_func is not None:
                        source_batch.append(ppl_item._src_func(entry))
                    else:
                        source_batch.append(entry)

            # At present, each batch is expected to be a list of contents.
            assert(isinstance(source_batch, list))

            if batching_supported:
                # Single call for the whole batch; its outcome is wrapped into a list.
                outcomes = [ppl_item.apply(input_data=source_batch, pipeline_ctx=pipeline_ctx)]
            else:
                # One call per element of the batch.
                outcomes = [ppl_item.apply(input_data=entry, pipeline_ctx=pipeline_ctx)
                            for entry in source_batch]

            pipeline_ctx.update(param=ppl_item.ResultKey, value=outcomes, is_new_key=False)

        return pipeline_ctx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
class BatchIterator:
    """ Single-pass iterator that splits a list into consecutive,
        non-overlapping batches.

        Every step yields a slice of at most `batch_size` elements; the last
        batch is shorter when the list length is not a multiple of
        `batch_size`. An empty list yields no batches.
    """

    def __init__(self, lst, batch_size):
        """ lst: list
                elements to be split into batches.
            batch_size: int
                positive maximal amount of elements per batch.
        """
        assert(isinstance(lst, list))
        assert(isinstance(batch_size, int) and batch_size > 0)
        self.__lst = lst
        self.__index = 0
        self.__batch_size = batch_size

    def __iter__(self):
        return self

    def __next__(self):
        """ Return the next batch or raise StopIteration once all the
            elements have been provided.
        """
        if self.__index >= len(self.__lst):
            raise StopIteration

        batch = self.__lst[self.__index:self.__index + self.__batch_size]

        # The cursor has to move by the batch size: a fixed step skips
        # elements for smaller batches and repeats them for larger ones.
        self.__index += self.__batch_size

        return batch
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.