-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathparser.py
53 lines (40 loc) · 1.89 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from arekit.common.docs.objects_parser import SentenceObjectsParserPipelineItem
from arekit.common.pipeline.context import PipelineContext
from arekit.common.text.partitioning.str import StringPartitioning
from arekit.common.text.partitioning.terms import TermsPartitioning
from arekit.contrib.source.brat.sentence import BratSentence
class BratTextEntitiesParser(SentenceObjectsParserPipelineItem):
    """Sentence objects parser that works on a BRAT sentence.

    The sentence is read from the pipeline context under ``KEY``. This class
    supplies its text and an iteration over its entities with local bounds
    through the protected methods below (presumably hooks of the base class,
    which is not visible here -- confirm against
    ``SentenceObjectsParserPipelineItem``).
    """

    # Key under which the sentence is looked up in the pipeline context.
    KEY = "sentence"

    ################################
    # NOTE: Supported partitionings.
    ################################
    # BRAT annotation describes entity bounds over the input string by
    # default, i.e. as a `char-ind-begin` / `char-ind-end` pair. The input
    # may instead be expanded into a list of terms, in which case the bounds
    # are the `ind-begin` / `ind-end` indices of that list.
    __supported_partitionings = {
        "string": StringPartitioning(),
        "terms": TermsPartitioning(),
    }

    def __init__(self, partitioning="string"):
        """Select the partitioning by name and pass it to the base class.

        :param partitioning: name of a supported partitioning, either
            ``"string"`` (default) or ``"terms"``. Any other name fails the
            lookup with ``KeyError``.
        """
        assert isinstance(partitioning, str)
        chosen_partitioning = self.__supported_partitionings[partitioning]
        super(BratTextEntitiesParser, self).__init__(chosen_partitioning)

    # region protected methods

    def _get_text(self, pipeline_ctx):
        """Return the ``Text`` of the sentence held by ``pipeline_ctx``."""
        return self.__get_sentence(pipeline_ctx).Text

    def _get_parts_provider_func(self, input_data, pipeline_ctx):
        """Return the sentence's entities together with their local bounds.

        ``input_data`` is not used here; only the sentence stored in
        ``pipeline_ctx`` is consulted.
        """
        brat_sentence = self.__get_sentence(pipeline_ctx)
        return self.__iter_subs_values_with_bounds(brat_sentence)

    # endregion

    # region private methods

    def __get_sentence(self, pipeline_ctx):
        """Fetch the object registered under ``KEY`` in ``pipeline_ctx``.

        Asserts that ``pipeline_ctx`` is a ``PipelineContext`` and that it
        contains ``KEY``.
        """
        assert isinstance(pipeline_ctx, PipelineContext)
        assert self.KEY in pipeline_ctx
        return pipeline_ctx.provide(self.KEY)

    @staticmethod
    def __iter_subs_values_with_bounds(sentence):
        """Delegate to ``BratSentence.iter_entity_with_local_bounds``."""
        assert isinstance(sentence, BratSentence)
        return sentence.iter_entity_with_local_bounds()

    # endregion