Merge pull request #438 from nicolay-r/master
Sync with latest updates
nicolay-r committed Jan 18, 2023
2 parents bca6dc1 + 395671f commit a2f6fe8
Showing 9 changed files with 94 additions and 38 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -55,6 +55,8 @@ Please follows th

## Applications

+* **AREnets** [[github]](https://github.com/nicolay-r/AREnets)
+    * is an OpenNRE-like project, but with a kernel based on the TensorFlow library and implementations of neural networks on top of it, designed for Attitude
 * **ARElight** [[site]](https://nicolay-r.github.io/arelight-page/) [[github]](https://github.com/nicolay-r/ARElight)
     * **Infer attitudes** from large Mass-media documents or **sample texts** for your Machine Learning model applications

1 change: 1 addition & 0 deletions arekit/contrib/networks/input/const.py
@@ -4,5 +4,6 @@
 SynonymObject = "syn_objs"
 SynonymSubject = "syn_subjs"
 PosTags = "pos_tags"
+Text = "text"

 ArgsSep = ','
51 changes: 33 additions & 18 deletions arekit/contrib/networks/input/rows_parser.py
@@ -1,24 +1,35 @@
 import pandas as pd

 from arekit.common.data import const
 from arekit.common.utils import filter_whitespaces, split_by_whitespaces
-from . import const as network_input_const

+import arekit.contrib.networks.input.const as network_input_const

 empty_list = []


+def no_value():
+    return None
+
+
 def __process_values_list(value):
     return value.split(network_input_const.ArgsSep)


 def __process_indices_list(value):
-    return [int(v) for v in str(value).split(network_input_const.ArgsSep)]
+    return no_value() if not value else [int(v) for v in str(value).split(network_input_const.ArgsSep)]


 def __process_int_values_list(value):
     return __process_indices_list(value)


+def __handle_text(value):
+    """ The core method of the input text processing.
+    """
+    assert(isinstance(value, str) or isinstance(value, list))
+    return filter_whitespaces([term for term in split_by_whitespaces(value)]
+                              if isinstance(value, str) else value)
+
+
 parse_value = {
     const.ID: lambda value: value,
     const.DOC_ID: lambda value: int(value),
@@ -35,18 +46,19 @@ def __process_int_values_list(value)
     network_input_const.SynonymObject: lambda value: __process_indices_list(value),
     network_input_const.SynonymSubject: lambda value: __process_indices_list(value),
     network_input_const.PosTags: lambda value: __process_int_values_list(value),
-    "text_a": lambda value: filter_whitespaces([term for term in split_by_whitespaces(value)])
+    network_input_const.Text: lambda value: __handle_text(value)
 }


 class ParsedSampleRow(object):
-    """
-    Provides a parsed information for a sample row.
-    TODO. Use this class as API
+    """ Provides a parsed information for a sample row.
     """

     def __init__(self, row):
-        assert(isinstance(row, pd.Series))
+        """ row: dict
+                dict of the pairs ("field_name", value)
+        """
+        assert(isinstance(row, dict))

         self.__uint_label = None
         self.__params = {}
@@ -64,13 +76,16 @@ def __init__(self, row):

         self.__params[key] = parse_value[key](value)

+    def __value_or_none(self, key):
+        return self.__params[key] if key in self.__params else no_value()
+
     @property
     def SampleID(self):
         return self.__params[const.ID]

     @property
     def Terms(self):
-        return self.__params["text_a"]
+        return self.__params[network_input_const.Text]

     @property
     def SubjectIndex(self):
@@ -86,33 +101,33 @@ def UintLabel(self):

     @property
     def PartOfSpeechTags(self):
-        return self.__params[network_input_const.PosTags]
+        return self.__value_or_none(network_input_const.PosTags)

     @property
     def TextFrameVariantIndices(self):
-        return self.__params[network_input_const.FrameVariantIndices]
+        return self.__value_or_none(network_input_const.FrameVariantIndices)

     @property
     def TextFrameConnotations(self):
-        return self.__params[network_input_const.FrameConnotations]
+        return self.__value_or_none(network_input_const.FrameConnotations)

     @property
     def EntityInds(self):
-        return self.__params[const.ENTITIES]
+        return self.__value_or_none(const.ENTITIES)

     @property
     def SynonymObjectInds(self):
-        return self.__params[network_input_const.SynonymObject]
+        return self.__value_or_none(network_input_const.SynonymObject)

     @property
     def SynonymSubjectInds(self):
-        return self.__params[network_input_const.SynonymSubject]
+        return self.__value_or_none(network_input_const.SynonymSubject)

     def __getitem__(self, item):
         assert (isinstance(item, str) or item is None)
         if item not in self.__params:
-            return None
-        return self.__params[item] if item is not None else None
+            return no_value()
+        return self.__params[item] if item is not None else no_value()

     @classmethod
     def parse(cls, row):
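The new dict-based row API can be exercised directly. A minimal sketch, assuming `ParsedSampleRow.parse` simply forwards the dict to the constructor and that only columns known to `parse_value` are supplied (the sample values are hypothetical):

```python
from arekit.common.data import const
import arekit.contrib.networks.input.const as network_input_const
from arekit.contrib.networks.input.rows_parser import ParsedSampleRow

# A hypothetical sample row; keys reuse the column constants shown in the diff above.
row = {
    const.ID: "train_0",
    network_input_const.Text: "United States imposed sanctions against Iran",
}

parsed = ParsedSampleRow.parse(row)
parsed.SampleID          # "train_0"
parsed.Terms             # whitespace-split, filtered list of terms
parsed.PartOfSpeechTags  # None -- absent optional columns fall back to no_value()
```

Note the behavioral change: optional columns (POS tags, frame indices, entity and synonym indices) no longer raise `KeyError` when missing, since `__value_or_none` returns `None` instead.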
37 changes: 22 additions & 15 deletions arekit/contrib/source/brat/annot.py
@@ -14,33 +14,40 @@ def __non_prefixed_id(value):

     @staticmethod
     def handle_entity(args):
+        """ T2 Location 10 23 South America
+            T1 Location 0 5;16 23 North America
+        """
+        assert(len(args) == 3)

-        if len(args) < 4:
-            return None
+        e_id = int(BratAnnotationParser.__non_prefixed_id(args[0]))
+        entity_params = args[1].split()

-        if not str.isdigit(args[2]) or not str.isdigit(args[3]):
+        if len(entity_params) > 3:
             # We do not support the case of non-continuous entity mentions.
             return None

-        e_id = int(BratAnnotationParser.__non_prefixed_id(args[0]))
-        e_str_type = args[1]
-        e_begin = int(args[2])
-        e_end = int(args[3])
-        e_value = " ".join([arg.strip().replace(',', '') for arg in args[4:]])
+        e_str_type, e_begin, e_end = entity_params

         return BratEntity(id_in_doc=e_id,
                           e_type=e_str_type,
-                          index_begin=e_begin,
-                          index_end=e_end,
-                          value=e_value)
+                          index_begin=int(e_begin),
+                          index_end=int(e_end),
+                          value=args[2].strip())

     @staticmethod
     def handle_relation(args):
         """ Example:
                 R1 Origin Arg1:T3 Arg2:T4
         """

+        # Parse identifier index.
         e_id = args[0][1:]

-        rel_type = args[1]
-        source_id = args[2].split(':')[1]
-        target_id = args[3].split(':')[1]
+        # Parse relation arguments.
+        rel_type, source, target = args[1].split()
+
+        source_id = source.split(':')[1]
+        target_id = target.split(':')[1]

         return BratRelation(id_in_doc=e_id,
                             source_id=int(BratAnnotationParser.__non_prefixed_id(source_id)),
@@ -57,7 +64,7 @@ def parse_annotations(input_file, encoding='utf-8'):
         for line in input_file.readlines():
             line = line.decode(encoding)

-            args = line.split()
+            args = line.split('\t')

             record_type = args[0][0]

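Because annotation lines are now split by tabs, the "type begin end" triple arrives as a single field and multi-word entity values survive parsing intact. A hedged sketch of the new `handle_entity` behavior (the shape of the returned `BratEntity` is assumed from the constructor call in the diff):

```python
from arekit.contrib.source.brat.annot import BratAnnotationParser

# args mimic line.split('\t'): identifier, "type begin end" triple, entity value.
entity = BratAnnotationParser.handle_entity(["T2", "Location 10 23", "South America"])
# -> BratEntity with e_type="Location", bounds 10..23, and the multi-word
#    value "South America" kept intact.

# Non-continuous mentions are rejected: "Location 0 5;16 23" splits into
# four space-separated tokens, so the parser returns None.
assert BratAnnotationParser.handle_entity(["T1", "Location 0 5;16 23", "North America"]) is None
```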
15 changes: 15 additions & 0 deletions arekit/contrib/utils/data/readers/jsonl.py
@@ -0,0 +1,15 @@
+from arekit.contrib.utils.data.readers.base import BaseReader
+from arekit.contrib.utils.data.storages.jsonl_based import JsonlBasedRowsStorage
+
+
+class JsonlReader(BaseReader):
+
+    def read(self, target):
+        rows = []
+        with open(target, "r") as f:
+            for line in f.readlines():
+                rows.append(line)
+        return JsonlBasedRowsStorage(rows)
+
+    def target_extension(self):
+        return ".jsonl"
18 changes: 18 additions & 0 deletions arekit/contrib/utils/data/storages/jsonl_based.py
@@ -0,0 +1,18 @@
+import json
+
+from arekit.common.data.storages.base import BaseRowsStorage
+
+
+class JsonlBasedRowsStorage(BaseRowsStorage):
+
+    def __init__(self, rows):
+        assert(isinstance(rows, list))
+        self.__rows = rows
+
+    def _iter_rows(self):
+        for row_index, row in enumerate(self.__rows):
+            assert(isinstance(row, str))
+            yield row_index, json.loads(row)
+
+    def _get_rows_count(self):
+        return len(self.__rows)
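A minimal sketch of what the storage does with the raw lines; `_iter_rows` is a protected hook normally driven by `BaseRowsStorage`, called directly here only for illustration:

```python
from arekit.contrib.utils.data.storages.jsonl_based import JsonlBasedRowsStorage

rows = ['{"id": "i0", "text": "..."}',
        '{"id": "i1", "text": "..."}']

storage = JsonlBasedRowsStorage(rows)
for row_index, row in storage._iter_rows():
    print(row_index, row["id"])  # each raw line is decoded lazily via json.loads
```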
5 changes: 2 additions & 3 deletions arekit/contrib/utils/evaluation/analyze_errors.py
@@ -131,13 +131,12 @@ def extract_errors(eval_result, test_samples_filepath, etalon_samples_filepath,
         for sample_col in columns_to_copy:
             eval_errors_df.at[row_id, sample_col] = sample_row[sample_col]

-        text_terms =__post_text_processing(sample_row=sample_row, source_ind=source_ind, target_ind=target_ind)
+        text_terms = __post_text_processing(sample_row=sample_row, source_ind=source_ind, target_ind=target_ind)
         cropped_text = __crop_text_terms(source_ind=source_ind, target_ind=target_ind, text_terms=text_terms)

         eval_errors_df.at[row_id, BaseSingleTextProvider.TEXT_A] = cropped_text

-        # Replace with the values instead of indices.
-        entity_inds = __get_entity_inds(sample_row)
+        # Replace source and target indices with the related term values.
         eval_errors_df.at[row_id, const.S_IND] = text_terms[source_ind]
         eval_errors_df.at[row_id, const.T_IND] = text_terms[target_ind]
@@ -44,7 +44,6 @@ def create_text_opinion_extraction_pipeline(text_parser,
                                                  version=version,
                                                  doc_id_func=lambda doc_id: doc_id,
                                                  keep_doc_ids_only=False,
-                                                 label_scaler=label_scaler,
                                                  limit=limit)

     doc_ops = DictionaryBasedDocumentOperations(ru_attitudes)
@@ -18,7 +18,7 @@ def get_doc(self, doc_id):
         return self.__ru_attitudes[doc_id]


-def read_ruattitudes_to_brat_in_memory(version, keep_doc_ids_only, doc_id_func, label_scaler, limit=None):
+def read_ruattitudes_to_brat_in_memory(version, keep_doc_ids_only, doc_id_func, limit=None):
     """ Performs reading of RuAttitude formatted documents and
         selection according to 'doc_ids_set' parameter.
     """
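With `label_scaler` dropped, call sites pass only reading-related arguments. A sketch mirroring the updated call in `create_text_opinion_extraction_pipeline` above (`version` and `limit` are assumed to be in scope, and the reader's import path is not shown in this diff):

```python
# Labels are no longer scaled at reading time; the reader just loads documents.
ru_attitudes = read_ruattitudes_to_brat_in_memory(version=version,
                                                  doc_id_func=lambda doc_id: doc_id,
                                                  keep_doc_ids_only=False,
                                                  limit=limit)
```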
