Skip to content

Commit

Permalink
Refactoring related to #93 which also led to #122
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed May 27, 2021
1 parent 07ee6aa commit 0f23b3d
Show file tree
Hide file tree
Showing 39 changed files with 170 additions and 169 deletions.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@
logging.basicConfig(level=logging.INFO)


class BaseNeutralAnnotator(object):
class BaseAnnotator(object):
"""
Performs neutral annotation for different data_type.
Performs annotation for a particular data_type
using OpinOps and DocOps API.
"""

def __init__(self):
Expand Down Expand Up @@ -40,26 +41,26 @@ def _DocOps(self):

# region private methods

def __iter_neutral_collections(self, data_type, filter_func):
def __iter_annotated_collections(self, data_type, filter_func):
docs_to_annot_list = filter(filter_func,
self._DocOps.iter_doc_ids_to_neutrally_annotate())
self._DocOps.iter_doc_ids_to_annotate())

if len(docs_to_annot_list) == 0:
logger.info("[{}]: OK!".format(data_type))
logger.info("[{}]: Nothing to annotate".format(data_type))
return

logged_parsed_news_iter = progress_bar_iter(
iterable=self._DocOps.iter_parsed_news(docs_to_annot_list),
desc="Creating neutral-examples [{}]".format(data_type))
desc="Annotating parsed news [{}]".format(data_type))

for parsed_news in logged_parsed_news_iter:
assert(isinstance(parsed_news, ParsedNews))
yield parsed_news.RelatedNewsID, \
self._create_collection_core(parsed_news=parsed_news, data_type=data_type)
self._annot_collection_core(parsed_news=parsed_news, data_type=data_type)

# endregion

def _create_collection_core(self, parsed_news, data_type):
def _annot_collection_core(self, parsed_news, data_type):
raise NotImplementedError

# region public methods
Expand All @@ -72,12 +73,12 @@ def initialize(self, opin_ops, doc_ops):

def serialize_missed_collections(self, data_type):

filter_func = lambda doc_id: self._OpinOps.try_read_neutrally_annotated_opinion_collection(
filter_func = lambda doc_id: self._OpinOps.try_read_annotated_opinion_collection(
doc_id=doc_id, data_type=data_type) is None

for doc_id, collection in self.__iter_neutral_collections(data_type, filter_func):
self._OpinOps.save_neutrally_annotated_opinion_collection(collection=collection,
doc_id=doc_id,
data_type=data_type)
for doc_id, collection in self.__iter_annotated_collections(data_type, filter_func):
self._OpinOps.save_annotated_opinion_collection(collection=collection,
doc_id=doc_id,
data_type=data_type)

# endregion
4 changes: 4 additions & 0 deletions common/experiment/annot/base_annot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class BaseAnnotationAlgorithm(object):
    """ Base interface of an annotation algorithm.
        Declares the `iter_opinions` entry point which is expected
        to be overridden by a concrete algorithm.
    """

    def iter_opinions(self, parsed_news, entities_collection, existed_opinions=None):
        """ Provides opinions composed for a particular document.

            parsed_news:
                parsed document to be annotated.
            entities_collection:
                collection of the entities to compose opinions from.
            existed_opinions:
                optional collection of already known opinions (None by default).

            raises NotImplementedError:
                the base class provides no algorithm; failing here, rather than
                silently returning None, reports a missing override at the call
                itself instead of at the later attempt to iterate the result.
        """
        raise NotImplementedError()
Original file line number Diff line number Diff line change
@@ -1,31 +1,33 @@
from arekit.common.entities.base import Entity
from arekit.common.entities.collection import EntityCollection
from arekit.common.experiment.neutral.algo.base import BaseNeutralAnnotationAlgorithm
from arekit.common.labels.base import NeutralLabel
from arekit.common.experiment.annot.base_annot import BaseAnnotationAlgorithm
from arekit.common.labels.base import NoLabel, Label
from arekit.common.news.parsed.base import ParsedNews
from arekit.common.opinions.base import Opinion
from arekit.common.dataset.text_opinions.enums import DistanceType
from arekit.common.dataset.text_opinions.helper import TextOpinionHelper


class DefaultNeutralAnnotationAlgorithm(BaseNeutralAnnotationAlgorithm):
class DefaultSingleLabelAnnotationAlgorithm(BaseAnnotationAlgorithm):
"""
Neutral annotation algorithm which assumes to compose pairs
within a sentence which are not a part of sentiment.
"""

def __init__(self, dist_in_terms_bound, dist_in_sents=0, ignored_entity_values=None):
def __init__(self, dist_in_terms_bound, label_instance, dist_in_sents=0, ignored_entity_values=None):
"""
dist_in_terms_bound: int
max allowed distance in term (less than passed value)
"""
assert(isinstance(ignored_entity_values, list) or ignored_entity_values is None)
assert(isinstance(dist_in_terms_bound, int) or dist_in_terms_bound is None)
assert(isinstance(label_instance, Label))
assert(isinstance(dist_in_sents, int))

self.__ignored_entity_values = [] if ignored_entity_values is None else ignored_entity_values
self.__dist_in_terms_bound = dist_in_terms_bound
self.__dist_in_sents = dist_in_sents
self.__label_instance = label_instance

# region private methods

Expand All @@ -39,7 +41,7 @@ def __is_ignored_entity_value(self, entity_value):
assert(isinstance(entity_value, unicode))
return entity_value in self.__ignored_entity_values

def __iter_opinions_between_entties(self, relevant_pairs, entities_collection):
def __iter_opinions_between_entities(self, relevant_pairs, entities_collection):
assert(isinstance(entities_collection, EntityCollection))

for e1 in entities_collection:
Expand All @@ -54,11 +56,11 @@ def __iter_opinions_between_entties(self, relevant_pairs, entities_collection):

opinion = Opinion(source_value=e1.Value,
target_value=e2.Value,
sentiment=NeutralLabel())
sentiment=self.__label_instance)

yield opinion

def __try_create_pair_key(self, parsed_news, e1, e2, sentiment_opinions):
def __try_create_pair_key(self, parsed_news, e1, e2, existed_opinions):
assert(isinstance(e1, Entity))
assert(isinstance(e2, Entity))

Expand All @@ -84,18 +86,18 @@ def __try_create_pair_key(self, parsed_news, e1, e2, sentiment_opinions):
if self.__dist_in_terms_bound is not None and t_dist > self.__dist_in_terms_bound:
return

if sentiment_opinions is not None:
if existed_opinions is not None:
o = Opinion(source_value=e1.Value,
target_value=e2.Value,
sentiment=NeutralLabel())
if sentiment_opinions.has_synonymous_opinion(opinion=o):
sentiment=self.__label_instance)
if existed_opinions.has_synonymous_opinion(opinion=o):
return

return self.__create_key_by_entity_pair(e1=e1, e2=e2)

# endregion

def iter_neutral_opinions(self, parsed_news, entities_collection, sentiment_opinions=None):
def iter_opinions(self, parsed_news, entities_collection, existed_opinions=None):
assert(isinstance(parsed_news, ParsedNews))
assert(isinstance(entities_collection, EntityCollection))

Expand All @@ -109,12 +111,12 @@ def iter_neutral_opinions(self, parsed_news, entities_collection, sentiment_opin

key = self.__try_create_pair_key(parsed_news=parsed_news,
e1=e1, e2=e2,
sentiment_opinions=sentiment_opinions)
existed_opinions=existed_opinions)

if key is None:
continue

relevant_pairs[key] = 0

return self.__iter_opinions_between_entties(relevant_pairs=relevant_pairs,
entities_collection=entities_collection)
return self.__iter_opinions_between_entities(relevant_pairs=relevant_pairs,
entities_collection=entities_collection)
20 changes: 20 additions & 0 deletions common/experiment/annotate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from arekit.common.experiment.annot.base import BaseAnnotator
from arekit.common.experiment.formats.documents import DocumentOperations
from arekit.common.experiment.formats.opinions import OpinionOperations


def do_annotation(logger, annotator, opin_ops, doc_ops):
    """ Runs the annotation stage for every supported data type.

        logger:
            object utilized for progress reporting (its `info` method is called).
        annotator: BaseAnnotator
            annotator to be initialized and then utilized for serialization.
        opin_ops: OpinionOperations
            passed into the annotator during initialization.
        doc_ops: DocumentOperations
            passed into the annotator during initialization; its DataFolding
            provides the data types to iterate through.
    """
    assert isinstance(doc_ops, DocumentOperations)
    assert isinstance(opin_ops, OpinionOperations)
    assert isinstance(annotator, BaseAnnotator)

    # Annotator has to receive both operations before any serialization call.
    logger.info("Initializing annotator ...")
    annotator.initialize(doc_ops=doc_ops, opin_ops=opin_ops)

    # Data types are requested only after the message above has been logged.
    logger.info("Perform annotation ...")
    folding = doc_ops.DataFolding
    for supported_type in folding.iter_supported_data_types():
        annotator.serialize_missed_collections(data_type=supported_type)
20 changes: 10 additions & 10 deletions common/experiment/data/serializing.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
from arekit.common.experiment.annot.base import BaseAnnotator
from arekit.common.experiment.data.base import DataIO
from arekit.common.experiment.neutral.annot.base import BaseNeutralAnnotator
from arekit.common.labels.scaler import BaseLabelScaler


class SerializationData(DataIO):
""" Data, that is necessary for models training stage.
"""

def __init__(self, label_scaler, neutral_annot, stemmer):
def __init__(self, label_scaler, annot, stemmer):
assert(isinstance(label_scaler, BaseLabelScaler))
assert(isinstance(neutral_annot, BaseNeutralAnnotator))
assert(isinstance(annot, BaseAnnotator))
super(SerializationData, self).__init__(stemmer=stemmer)

self.__label_scaler = label_scaler

if self.LabelsCount != neutral_annot.LabelsCount:
raise Exception(u"Label scaler and neutral annotation are incompatible due to differs in labels count!")
if self.LabelsCount != annot.LabelsCount:
raise Exception(u"Label scaler and annotator are incompatible due to differs in labels count!")

self.__neutral_annot = neutral_annot
self.__annot = annot

@property
def LabelsScaler(self):
Expand All @@ -31,12 +31,12 @@ def LabelsCount(self):
return self.__label_scaler.LabelsCount

@property
def NeutralAnnotator(self):
""" Provides an instance of neutral annotator that might be utlized
for neutral attitudes labeling for a specific set of documents,
def Annotator(self):
""" Provides an instance of annotator that might be utilized
for attitudes labeling within a specific set of documents,
declared in a particular experiment (see OpinionOperations).
"""
return self.__neutral_annot
return self.__annot

@property
def StringEntityFormatter(self):
Expand Down
8 changes: 4 additions & 4 deletions common/experiment/formats/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ def DataFolding(self):
"""
return self.__folding

def iter_doc_ids_to_neutrally_annotate(self):
""" provides set of documents that utilized by neutral annotator algorithm in order to
provide the related labeling of neutral attitudes in it.
By default we consider an empty set, so there is no need to utilize neutral annotator.
def iter_doc_ids_to_annotate(self):
""" provides set of documents that utilized by annotator algorithm in order to
provide the related labeling of annotated attitudes in it.
By default, we consider an empty set, so there is no need to utilize annotator.
"""
raise NotImplementedError()

Expand Down
6 changes: 3 additions & 3 deletions common/experiment/formats/opinions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@ def __init__(self):

# region annotation

def try_read_neutrally_annotated_opinion_collection(self, doc_id, data_type):
""" data_type denotes a set of neutral opinions, where in case of 'train' these are
def try_read_annotated_opinion_collection(self, doc_id, data_type):
""" data_type denotes a set of unlabeled opinions, where in case of 'train' these are
opinions that were ADDITIONALLY found to sentiment, while for 'test' these are
all the opinions that could be found in document.
"""
raise NotImplementedError()

def save_neutrally_annotated_opinion_collection(self, collection, doc_id, data_type):
def save_annotated_opinion_collection(self, collection, doc_id, data_type):
raise NotImplementedError()

# endregion
Expand Down
15 changes: 9 additions & 6 deletions common/experiment/io_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ def _get_filepath(out_dir, template, prefix):
assert(isinstance(prefix, unicode))
return join(out_dir, BaseIOUtils.__generate_tsv_archive_filename(template=template, prefix=prefix))

def _get_neutral_annot_name(self):
def _get_annotator_name(self):
""" We use custom implementation as it allows to
be independent of NeutralAnnotator instance.
"""
return u"neut_annot_{labels_count}l".format(labels_count=self._experiment.DataIO.LabelsCount)
return u"annot_{labels_count}l".format(labels_count=self._experiment.DataIO.LabelsCount)

# endregion

Expand All @@ -72,15 +72,18 @@ def get_input_sample_filepath(self, data_type):
template=template,
prefix=BaseSampleFormatter.formatter_type_log_name())

def create_neutral_opinion_collection_filepath(self, doc_id, data_type):
def create_annotated_collection_filepath(self, doc_id, data_type):
assert(isinstance(doc_id, int))
assert(isinstance(data_type, DataType))

annot_dir = self.__get_neutral_annotation_dir()
annot_dir = self.__get_annotator_dir()

if annot_dir is None:
raise NotImplementedError("Neutral root was not provided!")

# TODO. The filename should not depend on the `neut` suffix.
# TODO. The filename should not depend on the `neut` suffix.
# TODO. The filename should not depend on the `neut` suffix.
filename = u"art{doc_id}.neut.{d_type}.txt".format(doc_id=doc_id,
d_type=data_type.name)

Expand All @@ -97,8 +100,8 @@ def create_result_opinion_collection_filepath(self, data_type, doc_id, epoch_ind
def __generate_tsv_archive_filename(template, prefix):
return u"{prefix}-{template}.tsv.gz".format(prefix=prefix, template=template)

def __get_neutral_annotation_dir(self):
def __get_annotator_dir(self):
return join_dir_with_subfolder_name(dir=self.get_target_dir(),
subfolder_name=self._get_neutral_annot_name())
subfolder_name=self._get_annotator_name())

# endregion
Empty file.
5 changes: 0 additions & 5 deletions common/experiment/neutral/algo/base.py

This file was deleted.

Empty file.
20 changes: 0 additions & 20 deletions common/experiment/neutral/run.py

This file was deleted.

2 changes: 1 addition & 1 deletion common/labels/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ def to_class_str(self):
return self.__class__.__name__


class NeutralLabel(Label):
class NoLabel(Label):
pass
10 changes: 5 additions & 5 deletions contrib/bert/run_serializer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from arekit.common.experiment.annotate import do_annotation
from arekit.common.experiment.data_type import DataType
from arekit.common.experiment.engine.cv_based import ExperimentEngine
from arekit.common.experiment.formats.base import BaseExperiment
from arekit.common.experiment.input.encoder import BaseInputEncoder
from arekit.common.experiment.input.formatters.opinion import BaseOpinionsFormatter
from arekit.common.experiment.input.providers.opinions import OpinionProvider
from arekit.common.experiment.neutral.run import perform_neutral_annotation
from arekit.common.labels.str_fmt import StringLabelsFormatter
from arekit.contrib.bert.samplers.factory import create_bert_sample_formatter

Expand Down Expand Up @@ -77,9 +77,9 @@ def _handle_iteration(self, it_index):
self.__handle_iteration(data_type)

def _before_running(self):
perform_neutral_annotation(neutral_annotator=self._experiment.DataIO.NeutralAnnotator,
opin_ops=self._experiment.OpinionOperations,
doc_ops=self._experiment.DocumentOperations,
logger=self._logger)
do_annotation(annotator=self._experiment.DataIO.Annotator,
opin_ops=self._experiment.OpinionOperations,
doc_ops=self._experiment.DocumentOperations,
logger=self._logger)

# endregion
6 changes: 3 additions & 3 deletions contrib/bert/test/labels.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections import OrderedDict

from arekit.common.labels.base import Label, NeutralLabel
from arekit.common.labels.base import Label, NoLabel
from arekit.common.labels.scaler import BaseLabelScaler


Expand All @@ -16,11 +16,11 @@ class TestThreeLabelScaler(BaseLabelScaler):

def __init__(self):

uint_labels = [(NeutralLabel(), 0),
uint_labels = [(NoLabel(), 0),
(TestPositiveLabel(), 1),
(TestNegativeLabel(), 2)]

int_labels = [(NeutralLabel(), 0),
int_labels = [(NoLabel(), 0),
(TestPositiveLabel(), 1),
(TestNegativeLabel(), -1)]

Expand Down
Loading

0 comments on commit 0f23b3d

Please sign in to comment.