
Merge pull request #126 from nicolay-r/0.21.0-rc
0.21.0 rc
nicolay-r authored Jun 14, 2021
2 parents 615af0f + aaa8fd3 commit b3b098b
Showing 157 changed files with 791 additions and 742 deletions.
37 changes: 7 additions & 30 deletions README.md
@@ -5,12 +5,10 @@
</p>

**AREkit** (Attitude and Relation Extraction Toolkit) -- is a python toolkit, devoted to
-**sentiment attitude extraction** task. Please proceed to [[ABOUT.md]](ABOUT.md) for more details.
+document level Attitude and Relation Extraction for text objects with objects-synonymy support.

## Dependencies

List of the toolset dependencies is as follows:

* python == 2.7 (No doubts it will be updated to 3.4+)

* pymystem3 == 0.1.9
@@ -30,30 +28,9 @@ Then install dependencies as follows:
pip install -r dependencies.txt
```

-## References
-
-### Applications
-
-> TODO. Provide the list of all the applications, which are independed from the particular conferences.
-### Papers
-
-* Attention-Based Neural Networks for Sentiment Attitude Extraction using Distant Supervision
-  [[ACM-DOI]](https://doi.org/10.1145/3405962.3405985)
-    * Rusnachenko Nicolay, Loukachevitch Natalia
-    * WIMS-2020
-* Studying Attention Models in Sentiment Attitude Extraction Task
-  [[Springer]](https://doi.org/10.1007/978-3-030-51310-8_15) /
-  [[arXiv:2006.11605]](https://arxiv.org/abs/2006.11605)
-    * Rusnachenko Nicolay, Loukachevitch Natalia
-    * NLDB-2020
-* Distant Supervision for Sentiment Attitude Extraction
-  [[paper-ranlp-proceeding]](http://lml.bas.bg/ranlp2019/proceedings-ranlp-2019.pdf),
-  [[poster]](docs/ranlp_2019_poster_portrait.pdf)
-    * Rusnachenko Nikolay, Loukachevitch Natalia, Tutubalina Elena
-    * RANLP-2019
-* Neural Network Approach for Extracting Aggregated Opinions from Analytical Articles
-  [[paper]](https://link.springer.com/chapter/10.1007/978-3-030-23584-0_10)
-  [[code]](https://github.com/nicolay-r/sentiment-pcnn/tree/ccis-2019)
-    * Nicolay Rusnachenko, Natalia Loukachevitch
-    * TSD-2018
+## Framework Applications
+
+* Neural Networks for attitude extraction
+  [[code]](https://github.com/nicolay-r/neural-networks-for-attitude-extraction)
+* Input Formatter for BERT-based models
+  [[code]](https://github.com/nicolay-r/bert-utils-for-attitude-extraction)
18 changes: 16 additions & 2 deletions common/evaluation/evaluators/base.py
@@ -81,7 +81,17 @@ def _create_eval_result(self):

# region protected methods

-    def _calc_diff(self, etalon_opins, test_opins):
+    def _check_is_supported(self, label, is_label_supported):
+        if label is None:
+            return True
+
+        if not is_label_supported(label):
+            raise Exception(u"Label \"{label}\" is not supported by {e}".format(
+                label=label_to_str(label),
+                e=type(self).__name__))
+
+    def _calc_diff(self, etalon_opins, test_opins, is_label_supported):
+        assert(callable(is_label_supported))

        it = self.__iter_diff_core(etalon_opins=etalon_opins,
                                   test_opins=test_opins)
@@ -91,6 +101,9 @@ def _calc_diff(self, etalon_opins, test_opins):
for args in it:
opin, etalon_label, result_label = args

+            self._check_is_supported(label=etalon_label, is_label_supported=is_label_supported)
+            self._check_is_supported(label=result_label, is_label_supported=is_label_supported)

row = [opin.SourceValue.encode('utf-8'),
opin.TargetValue.encode('utf-8'),
None if etalon_label is None else label_to_str(etalon_label),
@@ -119,7 +132,8 @@ def evaluate(self, cmp_pairs):
for cmp_pair in cmp_pairs:
assert(isinstance(cmp_pair, OpinionCollectionsToCompare))
cmp_table = self._calc_diff(etalon_opins=cmp_pair.EtalonOpinionCollection,
-                                        test_opins=cmp_pair.TestOpinionCollection)
+                                        test_opins=cmp_pair.TestOpinionCollection,
+                                        is_label_supported=result.is_label_supported)

result.reg_doc(cmp_pair=cmp_pair, cmp_table=cmp_table)

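For readers skimming the diff: the net effect of the evaluator change is that any etalon or result label outside the evaluation result's supported set now fails fast instead of silently entering the comparison table. A minimal standalone sketch of that behaviour (the simplified label classes and names below are assumptions for illustration, not AREkit code):

```python
# Simplified stand-ins for AREkit label classes (illustration only).
class PositiveLabel(object):
    pass


class NegativeLabel(object):
    pass


def check_is_supported(label, is_label_supported):
    # Mirrors the logic of BaseEvaluator._check_is_supported above:
    # a missing (None) label is always fine, any other label must be
    # accepted by the predicate supplied by the evaluation result.
    if label is None:
        return True
    if not is_label_supported(label):
        raise Exception("Label \"{label}\" is not supported".format(label=label))
    return True


is_label_supported = lambda label: isinstance(label, (PositiveLabel, NegativeLabel))

check_is_supported(PositiveLabel(), is_label_supported)   # ok
check_is_supported(None, is_label_supported)              # ok, None is allowed
# check_is_supported("neutral", is_label_supported)       # would raise Exception
```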
9 changes: 7 additions & 2 deletions common/evaluation/results/base.py
@@ -6,9 +6,11 @@

class BaseEvalResult(object):

-    def __init__(self):
+    def __init__(self, supported_labels):
+        assert(isinstance(supported_labels, set))
        self._cmp_tables = {}
        self._total_result = OrderedDict()
+        self.__supported_labels = supported_labels

# region properties

@@ -25,6 +27,9 @@ def calculate(self):

# endregion

+    def is_label_supported(self, label):
+        return label in self.__supported_labels

def get_result_by_metric(self, metric_name):
assert(isinstance(metric_name, unicode))
return self._total_result[metric_name]
@@ -34,7 +39,7 @@ def iter_total_by_param_results(self):
return self._total_result.iteritems()

    def iter_dataframe_cmp_tables(self):
-        yield self._cmp_tables.iteritems()
+        return self._cmp_tables.iteritems()

def reg_doc(self, cmp_pair, cmp_table):
""" Registering cmp_table.
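Taken together with the evaluator change, a concrete evaluation result is now expected to declare its label set up front. A hedged sketch of the wiring (the subclass name and the use of plain strings instead of Label instances are assumptions made to keep the example self-contained):

```python
from collections import OrderedDict


class BaseEvalResult(object):
    # Condensed from the diff above; only the supported-labels plumbing is kept.

    def __init__(self, supported_labels):
        assert(isinstance(supported_labels, set))
        self._cmp_tables = {}
        self._total_result = OrderedDict()
        self.__supported_labels = supported_labels

    def is_label_supported(self, label):
        return label in self.__supported_labels


class TwoClassEvalResult(BaseEvalResult):
    # Hypothetical subclass; real AREkit results would pass a set of Label instances.
    def __init__(self):
        super(TwoClassEvalResult, self).__init__(supported_labels={u"pos", u"neg"})


result = TwoClassEvalResult()
print(result.is_label_supported(u"pos"))   # True
print(result.is_label_supported(u"neu"))   # False -> the evaluator would raise
```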
File renamed without changes.
72 changes: 72 additions & 0 deletions common/experiment/annot/base.py
@@ -0,0 +1,72 @@
import logging

from arekit.common.experiment.formats.documents import DocumentOperations
from arekit.common.experiment.formats.opinions import OpinionOperations
from arekit.common.news.parsed.base import ParsedNews
from arekit.common.utils import progress_bar_iter

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class BaseAnnotator(object):
    """
    Performs annotation for a particular data_type
    using OpinOps and DocOps API.
    """

    def __init__(self):
        logger.info("Init annotator: [{}]".format(self.__class__))

    @property
    def LabelsCount(self):
        raise NotImplementedError()

    # region private methods

    def __iter_annotated_collections(self, data_type, filter_func, doc_ops, opin_ops):
        assert(isinstance(doc_ops, DocumentOperations))
        assert(isinstance(opin_ops, OpinionOperations))

        docs_to_annot_list = filter(filter_func,
                                    doc_ops.iter_doc_ids_to_annotate())

        if len(docs_to_annot_list) == 0:
            logger.info("[{}]: Nothing to annotate".format(data_type))
            return

        logged_parsed_news_iter = progress_bar_iter(
            iterable=doc_ops.iter_parsed_news(docs_to_annot_list),
            desc="Annotating parsed news [{}]".format(data_type))

        for parsed_news in logged_parsed_news_iter:
            assert(isinstance(parsed_news, ParsedNews))
            yield parsed_news.RelatedNewsID, \
                  self._annot_collection_core(parsed_news=parsed_news, data_type=data_type,
                                              doc_ops=doc_ops, opin_ops=opin_ops)

    # endregion

    def _annot_collection_core(self, parsed_news, data_type, doc_ops, opin_ops):
        raise NotImplementedError

    # region public methods

    def serialize_missed_collections(self, data_type, doc_ops, opin_ops):
        assert(isinstance(opin_ops, OpinionOperations))

        filter_func = lambda doc_id: opin_ops.try_read_annotated_opinion_collection(
            doc_id=doc_id, data_type=data_type) is None

        annot_it = self.__iter_annotated_collections(
            data_type,
            filter_func,
            doc_ops=doc_ops,
            opin_ops=opin_ops)

        for doc_id, collection in annot_it:
            opin_ops.save_annotated_opinion_collection(collection=collection,
                                                       doc_id=doc_id,
                                                       data_type=data_type)

    # endregion
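The class above only fixes the orchestration (filtering already-annotated documents, iterating parsed news, saving collections); concrete annotators supply the two missing pieces. A hedged sketch of that contract (the subclass name is hypothetical, and the collection construction is experiment-specific, so it is left unimplemented):

```python
from arekit.common.experiment.annot.base import BaseAnnotator


class TwoScaleAnnotator(BaseAnnotator):
    """ Hypothetical annotator, shown only to illustrate the extension points. """

    @property
    def LabelsCount(self):
        return 2

    def _annot_collection_core(self, parsed_news, data_type, doc_ops, opin_ops):
        # Expected to build and return the annotated opinion collection for
        # parsed_news; in practice this delegates to an annotation algorithm
        # such as the one in base_annot.py / the default algorithm below.
        raise NotImplementedError()
```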
4 changes: 4 additions & 0 deletions common/experiment/annot/base_annot.py
@@ -0,0 +1,4 @@
class BaseAnnotationAlgorithm(object):

    def iter_opinions(self, parsed_news, entities_collection, existed_opinions=None):
        pass
@@ -1,29 +1,33 @@
from arekit.common.entities.base import Entity
from arekit.common.entities.collection import EntityCollection
-from arekit.common.experiment.neutral.algo.base import BaseNeutralAnnotationAlgorithm
-from arekit.common.labels.base import NeutralLabel
+from arekit.common.experiment.annot.base_annot import BaseAnnotationAlgorithm
+from arekit.common.labels.base import NoLabel, Label
from arekit.common.news.parsed.base import ParsedNews
from arekit.common.opinions.base import Opinion
from arekit.common.dataset.text_opinions.enums import DistanceType
from arekit.common.dataset.text_opinions.helper import TextOpinionHelper


-class DefaultNeutralAnnotationAlgorithm(BaseNeutralAnnotationAlgorithm):
+class DefaultSingleLabelAnnotationAlgorithm(BaseAnnotationAlgorithm):
    """
    Neutral annotation algorithm which assumes to compose pairs
    within a sentence which are not a part of sentiment.
    """

-    def __init__(self, dist_in_terms_bound, ignored_entity_values=None):
+    def __init__(self, dist_in_terms_bound, label_instance, dist_in_sents=0, ignored_entity_values=None):
"""
dist_in_terms_bound: int
max allowed distance in term (less than passed value)
"""
assert(isinstance(ignored_entity_values, list) or ignored_entity_values is None)
assert(isinstance(dist_in_terms_bound, int) or dist_in_terms_bound is None)
+        assert(isinstance(label_instance, Label))
+        assert(isinstance(dist_in_sents, int))

self.__ignored_entity_values = [] if ignored_entity_values is None else ignored_entity_values
self.__dist_in_terms_bound = dist_in_terms_bound
+        self.__dist_in_sents = dist_in_sents
+        self.__label_instance = label_instance

# region private methods

@@ -37,7 +41,7 @@ def __is_ignored_entity_value(self, entity_value):
assert(isinstance(entity_value, unicode))
return entity_value in self.__ignored_entity_values

-    def __iter_opinions_between_entties(self, relevant_pairs, entities_collection):
+    def __iter_opinions_between_entities(self, relevant_pairs, entities_collection):
assert(isinstance(entities_collection, EntityCollection))

for e1 in entities_collection:
@@ -52,11 +56,11 @@ def __iter_opinions_between_entties(self, relevant_pairs, entities_collection):

opinion = Opinion(source_value=e1.Value,
target_value=e2.Value,
-                                  sentiment=NeutralLabel())
+                                  sentiment=self.__label_instance)

yield opinion

-    def __try_create_pair_key(self, parsed_news, e1, e2, sentiment_opinions):
+    def __try_create_pair_key(self, parsed_news, e1, e2, existed_opinions):
assert(isinstance(e1, Entity))
assert(isinstance(e2, Entity))

Expand All @@ -72,7 +76,7 @@ def __try_create_pair_key(self, parsed_news, e1, e2, sentiment_opinions):
e1=e1, e2=e2,
distance_type=DistanceType.InSentences)

-        if s_dist > 0:
+        if s_dist > self.__dist_in_sents:
return

t_dist = TextOpinionHelper.calc_dist_between_entities(parsed_news=parsed_news,
Expand All @@ -82,18 +86,18 @@ def __try_create_pair_key(self, parsed_news, e1, e2, sentiment_opinions):
if self.__dist_in_terms_bound is not None and t_dist > self.__dist_in_terms_bound:
return

-        if sentiment_opinions is not None:
+        if existed_opinions is not None:
            o = Opinion(source_value=e1.Value,
                        target_value=e2.Value,
-                        sentiment=NeutralLabel())
-            if sentiment_opinions.has_synonymous_opinion(opinion=o):
+                        sentiment=self.__label_instance)
+            if existed_opinions.has_synonymous_opinion(opinion=o):
                return

return self.__create_key_by_entity_pair(e1=e1, e2=e2)

# endregion

-    def iter_neutral_opinions(self, parsed_news, entities_collection, sentiment_opinions=None):
+    def iter_opinions(self, parsed_news, entities_collection, existed_opinions=None):
assert(isinstance(parsed_news, ParsedNews))
assert(isinstance(entities_collection, EntityCollection))

@@ -107,12 +111,12 @@ def iter_neutral_opinions(self, parsed_news, entities_collection, sentiment_opin

key = self.__try_create_pair_key(parsed_news=parsed_news,
e1=e1, e2=e2,
-                                             sentiment_opinions=sentiment_opinions)
+                                             existed_opinions=existed_opinions)

if key is None:
continue

relevant_pairs[key] = 0

-        return self.__iter_opinions_between_entties(relevant_pairs=relevant_pairs,
-                                                    entities_collection=entities_collection)
+        return self.__iter_opinions_between_entities(relevant_pairs=relevant_pairs,
+                                                      entities_collection=entities_collection)
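With the rename, the former neutral annotator becomes a generic single-label algorithm: the label is injected, and the sentence-distance bound is now configurable. A hedged usage sketch (the module path of the renamed file is not visible in this excerpt, and parsed_news / entities_collection / sentiment_opins are assumed to be prepared by the experiment pipeline):

```python
from arekit.common.labels.base import NoLabel
# from <renamed module> import DefaultSingleLabelAnnotationAlgorithm   # path not shown in this diff

algo = DefaultSingleLabelAnnotationAlgorithm(
    dist_in_terms_bound=10,        # pairs farther apart than 10 terms are skipped
    label_instance=NoLabel(),      # every produced Opinion carries this label
    dist_in_sents=0,               # 0 keeps the previous behaviour: same-sentence pairs only
    ignored_entity_values=None)

for opinion in algo.iter_opinions(parsed_news=parsed_news,
                                  entities_collection=entities_collection,
                                  existed_opinions=sentiment_opins):
    print(opinion.SourceValue, opinion.TargetValue)
```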
16 changes: 5 additions & 11 deletions common/experiment/data/base.py
@@ -1,4 +1,3 @@
-from arekit.common.labels.scaler import BaseLabelScaler
from arekit.common.model.model_io import BaseModelIO


@@ -8,19 +7,10 @@ class DataIO(object):
(data-serialization, training, etc.).
"""

-    def __init__(self, labels_scaler, stemmer):
-        assert(isinstance(labels_scaler, BaseLabelScaler))
-        self.__labels_scale = labels_scaler
+    def __init__(self, stemmer):
        self.__stemmer = stemmer
        self.__model_io = None

-    @property
-    def LabelsScaler(self):
-        """ Declares the amount of labels utilized in experiment. The latter
-        is necessary for conversions from int (uint) to Labels and vice versa.
-        """
-        return self.__labels_scale

@property
def ModelIO(self):
""" Provides model paths for the resources utilized during training process.
Expand All @@ -29,6 +19,10 @@ def ModelIO(self):
"""
return self.__model_io

+    @property
+    def LabelsCount(self):
+        raise NotImplementedError()

@property
def Stemmer(self):
return self.__stemmer
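Label scaling has moved out of DataIO entirely; an experiment's data class now only reports how many labels it works with, via the new LabelsCount property. A hedged sketch of a concrete data class under the new API (the subclass name and label count are assumptions; the import path mirrors the file location, as with the other imports in this commit):

```python
from arekit.common.experiment.data.base import DataIO


class ThreeScaleExperimentData(DataIO):
    """ Hypothetical experiment data class illustrating the LabelsCount contract. """

    def __init__(self, stemmer):
        super(ThreeScaleExperimentData, self).__init__(stemmer=stemmer)

    @property
    def LabelsCount(self):
        return 3   # e.g. positive / negative / neutral
```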