Skip to content

Commit

Permalink
#507 added task and labeling.
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Jul 29, 2023
1 parent 4cb1fba commit 31e0c75
Show file tree
Hide file tree
Showing 7 changed files with 170 additions and 2 deletions.
6 changes: 5 additions & 1 deletion arekit/contrib/source/nerel/folding/fixed.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from arekit.common.folding.fixed import FixedFolding


def create_fixed_folding(train_filenames, dev_filenames, test_filenames):
def create_fixed_folding(train_filenames, dev_filenames, test_filenames, limit=None):
""" Create fixed data-folding based on the predefined list of filenames,
written in file.
"""
Expand All @@ -18,6 +18,10 @@ def create_fixed_folding(train_filenames, dev_filenames, test_filenames):
for doc_id, filename in filenames_by_ids.items():
ids_by_filenames[filename] = doc_id

train_filenames = train_filenames if limit is None else train_filenames[:limit]
test_filenames = test_filenames if limit is None else test_filenames[:limit]
dev_filenames = dev_filenames if limit is None else dev_filenames[:limit]

fixed_folding = FixedFolding.from_parts({
DataType.Train: [ids_by_filenames[filename] for filename in train_filenames],
DataType.Test: [ids_by_filenames[filename] for filename in test_filenames],
Expand Down
2 changes: 1 addition & 1 deletion arekit/contrib/source/nerel/io_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def map_doc_to_fold_type(version=DEFAULT_VERSION):
return d2f

@staticmethod
def read_dataset_split(version=DEFAULT_VERSION):
def read_dataset_split(version=DEFAULT_VERSION, docs_limit=None):
f2d = {}
for filename, split_type in NerelIOUtils.__iter_filename_and_splittype(version):
if split_type not in f2d:
Expand Down
53 changes: 53 additions & 0 deletions arekit/contrib/source/nerel/labels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from arekit.common.labels.base import Label


class OpinionBelongsTo(Label):
    """ Label type that `NerelAnyLabelFormatter` pairs with
        the "OPINION_BELONGS_TO" string.
    """


class OpinionRelatesTo(Label):
    """ Label type that `NerelAnyLabelFormatter` pairs with
        the "OPINION_RELATES_TO" string.
    """


class NegEffectFrom(Label):
    """ Label type that `NerelAnyLabelFormatter` pairs with
        the "NEG_EFFECT_FROM" string.
    """


class NegStateFrom(Label):
    """ Label type that `NerelAnyLabelFormatter` pairs with
        the "NEG_STATE_FROM" string.
    """


class PosEffectFrom(Label):
    """ Label type that `NerelAnyLabelFormatter` pairs with
        the "POS_EFFECT_FROM" string.
    """


class PosAuthorFrom(Label):
    """ Label type that `NerelAnyLabelFormatter` pairs with
        the "POS_AUTHOR_FROM" string.
    """


class NegAuthorFrom(Label):
    """ Label type that `NerelAnyLabelFormatter` pairs with
        the "NEG_AUTHOR_FROM" string.
    """


class PosStateFrom(Label):
    """ Label type that `NerelAnyLabelFormatter` pairs with
        the "POS_STATE_FROM" string.
    """


class NegativeTo(Label):
    """ Label type that `NerelAnyLabelFormatter` pairs with
        the "NEGATIVE_TO" string.
    """


class PositiveTo(Label):
    """ Label type that `NerelAnyLabelFormatter` pairs with
        the "POSITIVE_TO" string.
    """


class AlternativeName(Label):
    """ Label type that `NerelAnyLabelFormatter` pairs with
        the "ALTERNATIVE_NAME" string.
    """


class StateBelongsTo(Label):
    """ Label type that `NerelAnyLabelFormatter` pairs with
        the "STATE_BELONGS_TO" string.
    """


class OriginsFrom(Label):
    """ Label type that `NerelAnyLabelFormatter` pairs with
        the "ORIGINS_FROM" string.
    """
Empty file.
28 changes: 28 additions & 0 deletions arekit/contrib/utils/pipelines/sources/nerel/doc_ops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from arekit.common.experiment.api.ops_doc import DocumentOperations
from arekit.contrib.source.nerel.io_utils import NerelVersions
from arekit.contrib.source.nerel.reader import NerelDocReader


class NERELDocOperation(DocumentOperations):
    """ Document operations for the NEREL collection: resolves a document
        id into its filename and delegates the reading to `NerelDocReader`.

        NOTE(review): the previous docstring pointed to the RuSentNE-2023
        competition repository; confirm whether it applies to this collection.
        github: https://github.com/dialogue-evaluation/RuSentNE-evaluation
    """

    def __init__(self, filename_by_id, version):
        """ filename_by_id: dict
                Dictionary of {id: filename}, where
                - id: int
                - filename: str
            version: NerelVersions
                Version of the collection; it is forwarded to `NerelDocReader`.
        """
        assert isinstance(filename_by_id, dict)
        assert isinstance(version, NerelVersions)
        super(NERELDocOperation, self).__init__()
        self.__filename_by_id = filename_by_id
        self.__version = version
        self.__doc_reader = NerelDocReader(version)

    def by_id(self, doc_id):
        """ Read and return the document registered under `doc_id`.
            The filename is taken from the {id: filename} dictionary
            provided to the constructor.
        """
        filename = self.__filename_by_id[doc_id]
        return self.__doc_reader.read_document(doc_id=doc_id, filename=filename)
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from arekit.common.experiment.api.ops_doc import DocumentOperations
from arekit.common.experiment.data_type import DataType
from arekit.contrib.source.nerel.io_utils import NerelIOUtils, NerelVersions
from arekit.contrib.utils.pipelines.sources.nerel.doc_ops import NERELDocOperation
from arekit.contrib.utils.pipelines.sources.nerel.labels_fmt import NerelAnyLabelFormatter
from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
from arekit.contrib.utils.pipelines.text_opinion.extraction import text_opinion_extraction_pipeline
from arekit.contrib.utils.pipelines.text_opinion.filters.distance_based import DistanceLimitedTextOpinionFilter
from arekit.contrib.utils.pipelines.text_opinion.filters.entity_based import EntityBasedTextOpinionFilter


def create_text_relation_extraction_pipeline(sentinerel_version,
                                             text_parser,
                                             label_formatter=NerelAnyLabelFormatter(),
                                             terms_per_context=50,
                                             doc_ops=None,
                                             docs_limit=None,
                                             entity_filter=None):
    """ Compose text-opinion extraction pipelines for the Train, Test and Dev
        data types, based on the predefined (already annotated) opinions.

        sentinerel_version: NerelVersions
            Version of the collection.
        text_parser:
            Parser forwarded to every `text_opinion_extraction_pipeline` call.
        label_formatter:
            Formatter handed to `PredefinedTextOpinionAnnotator`.
            NOTE(review): the default instance is created once, at function
            definition time, and therefore shared between calls.
        terms_per_context: int
            Argument of the `DistanceLimitedTextOpinionFilter`.
        doc_ops: DocumentOperations or None
            Documents provider; when None, the dataset split is read via
            `NerelIOUtils.read_dataset_split` and `NERELDocOperation` is used.
        docs_limit:
            Forwarded to `NerelIOUtils.read_dataset_split`; it is considered
            only when `doc_ops` is None.
        entity_filter:
            Argument of the `EntityBasedTextOpinionFilter`.

        returns: dict of {DataType: pipeline} when `doc_ops` was provided;
            otherwise a (pipelines, data_folding) pair, unless the split
            reader yields None as the data folding.
    """
    assert isinstance(sentinerel_version, NerelVersions)
    assert doc_ops is None or isinstance(doc_ops, DocumentOperations)

    default_folding = None

    if doc_ops is None:
        # Nothing has been provided: rely on the split of the collection itself.
        filenames_by_ids, default_folding = NerelIOUtils.read_dataset_split(
            version=sentinerel_version, docs_limit=docs_limit)
        doc_ops = NERELDocOperation(filename_by_id=filenames_by_ids,
                                    version=sentinerel_version)

    opinion_filters = [
        EntityBasedTextOpinionFilter(entity_filter=entity_filter),
        DistanceLimitedTextOpinionFilter(terms_per_context),
    ]

    annotator = PredefinedTextOpinionAnnotator(doc_ops, label_formatter)

    # Every data type obtains its own pipeline instance of the same setup.
    pipelines = {
        data_type: text_opinion_extraction_pipeline(text_parser=text_parser,
                                                    get_doc_by_id_func=doc_ops.by_id,
                                                    annotators=[annotator],
                                                    text_opinion_filters=opinion_filters)
        for data_type in (DataType.Train, DataType.Test, DataType.Dev)
    }

    if default_folding is None:
        return pipelines

    # The default data-folding was set up here, so the caller
    # has no other way to obtain it: provide it along with pipelines.
    return pipelines, default_folding
25 changes: 25 additions & 0 deletions arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from arekit.common.labels.str_fmt import StringLabelsFormatter
from arekit.contrib.source.nerel import labels


class NerelAnyLabelFormatter(StringLabelsFormatter):
    """ String formatter that covers every label type declared
        in `arekit.contrib.source.nerel.labels`.
    """

    def __init__(self):
        """ Register the {string: label type} table in the base formatter. """
        label_type_by_tag = {
            "OPINION_BELONGS_TO": labels.OpinionBelongsTo,
            "OPINION_RELATES_TO": labels.OpinionRelatesTo,
            "NEG_EFFECT_FROM": labels.NegEffectFrom,
            "POS_EFFECT_FROM": labels.PosEffectFrom,
            "NEG_STATE_FROM": labels.NegStateFrom,
            "POS_STATE_FROM": labels.PosStateFrom,
            "NEGATIVE_TO": labels.NegativeTo,
            "POSITIVE_TO": labels.PositiveTo,
            "STATE_BELONGS_TO": labels.StateBelongsTo,
            "POS_AUTHOR_FROM": labels.PosAuthorFrom,
            "NEG_AUTHOR_FROM": labels.NegAuthorFrom,
            "ALTERNATIVE_NAME": labels.AlternativeName,
            "ORIGINS_FROM": labels.OriginsFrom,
        }
        super(NerelAnyLabelFormatter, self).__init__(stol=label_type_by_tag)

0 comments on commit 31e0c75

Please sign in to comment.