#442 related fix
nicolay-r committed Feb 18, 2023
1 parent 57e77d5 commit 37157c4
Showing 6 changed files with 23 additions and 20 deletions.
@@ -1,4 +1,4 @@
-from arekit.contrib.utils.pipelines.sources.sentinerel.folding.fixed import create_fixed_folding
+from arekit.contrib.source.sentinerel.folding.fixed import create_fixed_folding
 
 
 class SentiNERELFoldingFactory:
@@ -7,15 +7,13 @@ class SentiNERELFoldingFactory:
     """
 
     @staticmethod
-    def create_fixed_folding(fixed_split_filepath, limit=None):
+    def create_fixed_folding(file, limit=None):
         """
-        fixed_split_filepath: str
-            filepath to the fixed collection split.
         limit: int
             Allows to limit amount of documents (utilized for testing reasons)
         """
 
-        train_filenames, test_filenames = SentiNERELFoldingFactory._read_train_test(fixed_split_filepath)
+        train_filenames, test_filenames = SentiNERELFoldingFactory._read_train_test(f=file)
         if limit is not None:
             train_filenames = train_filenames[:limit]
             test_filenames = test_filenames[:limit]
@@ -25,9 +23,8 @@ def create_fixed_folding(fixed_split_filepath, limit=None):
         return filenames_by_ids, data_folding
 
     @staticmethod
-    def _read_train_test(filepath):
-        with open(filepath, "r") as f:
-            parts = []
-            for line in f.readlines():
-                parts.append(line.strip().split(','))
+    def _read_train_test(f):
+        parts = []
+        for line in f.readlines():
+            parts.append(line.strip().split(','))
         return parts[0], parts[1]
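
Note on the change above: create_fixed_folding now receives an already opened file object rather than a filesystem path, so the split no longer needs to exist as a separate local file. A minimal sketch of how the updated factory could be driven, assuming the arekit.contrib.source.sentinerel.folding.factory module path that io_utils imports below, and assuming the two-line, comma-separated split format implied by _read_train_test (first line = train filenames, second line = test filenames):

from io import StringIO

from arekit.contrib.source.sentinerel.folding.factory import SentiNERELFoldingFactory

# Hypothetical split content: line 1 lists train documents, line 2 lists test documents.
split_file = StringIO("doc_train_a,doc_train_b,doc_train_c\ndoc_test_a,doc_test_b")

# Returns the {doc_id: filename} mapping and the fixed train/test data folding,
# optionally truncated via limit= (used for testing).
filenames_by_ids, data_folding = SentiNERELFoldingFactory.create_fixed_folding(file=split_file, limit=None)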
14 changes: 13 additions & 1 deletion arekit/contrib/source/sentinerel/io_utils.py
@@ -1,9 +1,10 @@
 from enum import Enum
 from os import path
-from os.path import basename
+from os.path import basename, join, dirname
 
 import enum
 
+from arekit.contrib.source.sentinerel.folding.factory import SentiNERELFoldingFactory
 from arekit.contrib.source.zip_utils import ZipArchiveUtils
 
 
@@ -72,4 +73,15 @@ def iter_collection_filenames(version=DEFAULT_VERSION):
         for doc_id, filename in enumerate(filenames_it):
             yield doc_id, filename
 
+    @staticmethod
+    def read_dataset_split(version=DEFAULT_VERSION, docs_limit=None):
+        """ Provides a fixed split of the dataset onto
+            `test` and `training` part:
+            https://github.com/nicolay-r/SentiNEREL-attitude-extraction
+        """
+        return ZipArchiveUtils.read_from_zip(
+            inner_path=join(SentiNerelIOUtils.inner_root, "split_fixed.txt"),
+            process_func=lambda f: SentiNERELFoldingFactory.create_fixed_folding(file=f, limit=docs_limit),
+            version=version)
+
     # endregion
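
A minimal usage sketch of the new helper, matching the call that replaces the local split file in the pipeline diff below; the keyword arguments come straight from the signature above, and the returned pair is the same one create_fixed_folding yields:

from arekit.contrib.source.sentinerel.io_utils import SentiNerelIOUtils

# Reads "split_fixed.txt" from inside the SentiNEREL archive and builds the fixed
# train/test folding; docs_limit optionally truncates both parts (testing only).
filenames_by_ids, data_folding = SentiNerelIOUtils.read_dataset_split(docs_limit=5)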
@@ -9,9 +9,8 @@
 from arekit.common.synonyms.base import SynonymsCollection
 from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
 from arekit.common.text.parser import BaseTextParser
-from arekit.contrib.source.sentinerel.io_utils import SentiNerelVersions
+from arekit.contrib.source.sentinerel.io_utils import SentiNerelVersions, SentiNerelIOUtils
 from arekit.contrib.utils.pipelines.sources.sentinerel.doc_ops import SentiNERELDocOperation
-from arekit.contrib.utils.pipelines.sources.sentinerel.folding.factory import SentiNERELFoldingFactory
 from arekit.contrib.utils.pipelines.sources.sentinerel.labels_fmt import SentiNERELSentimentLabelFormatter
 from arekit.contrib.utils.pipelines.text_opinion.annot.algo_based import AlgorithmBasedTextOpinionAnnotator
 from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
@@ -61,11 +60,8 @@ def create_text_opinion_extraction_pipeline(sentinerel_version,
 
     if doc_ops is None:
         # Default Initialization.
-        filenames_by_ids, data_folding = SentiNERELFoldingFactory.create_fixed_folding(
-            # This is a temporary solution with the "split_filepath.txt"
-            # TODO. This going to be fixed by mentioning this split into archive or so.
-            fixed_split_filepath=join(dirname(__file__), 'split_fixed.txt'),
-            limit=docs_limit)
+        filenames_by_ids, data_folding = SentiNerelIOUtils.read_dataset_split(version=sentinerel_version,
+                                                                              docs_limit=docs_limit)
         doc_ops = SentiNERELDocOperation(filename_by_id=filenames_by_ids,
                                          version=sentinerel_version)
 

This file was deleted.
