#442 related fix
nicolay-r committed Feb 18, 2023
1 parent 57e77d5 commit 37157c4
Showing 6 changed files with 23 additions and 20 deletions.
@@ -1,4 +1,4 @@
-from arekit.contrib.utils.pipelines.sources.sentinerel.folding.fixed import create_fixed_folding
+from arekit.contrib.source.sentinerel.folding.fixed import create_fixed_folding
 
 
 class SentiNERELFoldingFactory:
@@ -7,15 +7,13 @@ class SentiNERELFoldingFactory:
     """
 
     @staticmethod
-    def create_fixed_folding(fixed_split_filepath, limit=None):
+    def create_fixed_folding(file, limit=None):
         """
-        fixed_split_filepath: str
-            filepath to the fixed collection split.
         limit: int
             Allows to limit amount of documents (utilized for testing reasons)
         """
 
-        train_filenames, test_filenames = SentiNERELFoldingFactory._read_train_test(fixed_split_filepath)
+        train_filenames, test_filenames = SentiNERELFoldingFactory._read_train_test(f=file)
         if limit is not None:
             train_filenames = train_filenames[:limit]
             test_filenames = test_filenames[:limit]
@@ -25,9 +23,8 @@ def create_fixed_folding(fixed_split_filepath, limit=None):
         return filenames_by_ids, data_folding
 
     @staticmethod
-    def _read_train_test(filepath):
-        with open(filepath, "r") as f:
-            parts = []
-            for line in f.readlines():
-                parts.append(line.strip().split(','))
+    def _read_train_test(f):
+        parts = []
+        for line in f.readlines():
+            parts.append(line.strip().split(','))
         return parts[0], parts[1]
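
Note on the change above: create_fixed_folding now receives an already opened file object rather than a filesystem path, so the split no longer needs to exist as a separate local file. A minimal sketch of how the updated factory could be driven, assuming the arekit.contrib.source.sentinerel.folding.factory module path that io_utils imports below, and assuming the two-line, comma-separated split format implied by _read_train_test (first line = train filenames, second line = test filenames):

from io import StringIO

from arekit.contrib.source.sentinerel.folding.factory import SentiNERELFoldingFactory

# Hypothetical split content: line 1 lists train documents, line 2 lists test documents.
split_file = StringIO("doc_train_a,doc_train_b,doc_train_c\ndoc_test_a,doc_test_b")

# Returns the {doc_id: filename} mapping and the fixed train/test data folding,
# optionally truncated via limit= (used for testing).
filenames_by_ids, data_folding = SentiNERELFoldingFactory.create_fixed_folding(file=split_file, limit=None)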
14 changes: 13 additions & 1 deletion arekit/contrib/source/sentinerel/io_utils.py
@@ -1,9 +1,10 @@
 from enum import Enum
 from os import path
-from os.path import basename
+from os.path import basename, join, dirname
 
 import enum
 
+from arekit.contrib.source.sentinerel.folding.factory import SentiNERELFoldingFactory
 from arekit.contrib.source.zip_utils import ZipArchiveUtils
 
 
@@ -72,4 +73,15 @@ def iter_collection_filenames(version=DEFAULT_VERSION):
         for doc_id, filename in enumerate(filenames_it):
             yield doc_id, filename
 
+    @staticmethod
+    def read_dataset_split(version=DEFAULT_VERSION, docs_limit=None):
+        """ Provides a fixed split of the dataset onto
+            `test` and `training` part:
+            https://github.com/nicolay-r/SentiNEREL-attitude-extraction
+        """
+        return ZipArchiveUtils.read_from_zip(
+            inner_path=join(SentiNerelIOUtils.inner_root, "split_fixed.txt"),
+            process_func=lambda f: SentiNERELFoldingFactory.create_fixed_folding(file=f, limit=docs_limit),
+            version=version)
+
     # endregion
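
A minimal usage sketch of the new helper, matching the call that replaces the local split file in the pipeline diff below; the keyword arguments come straight from the signature above, and the returned pair is the same one create_fixed_folding yields:

from arekit.contrib.source.sentinerel.io_utils import SentiNerelIOUtils

# Reads "split_fixed.txt" from inside the SentiNEREL archive and builds the fixed
# train/test folding; docs_limit optionally truncates both parts (testing only).
filenames_by_ids, data_folding = SentiNerelIOUtils.read_dataset_split(docs_limit=5)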
@@ -9,9 +9,8 @@
 from arekit.common.synonyms.base import SynonymsCollection
 from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
 from arekit.common.text.parser import BaseTextParser
-from arekit.contrib.source.sentinerel.io_utils import SentiNerelVersions
+from arekit.contrib.source.sentinerel.io_utils import SentiNerelVersions, SentiNerelIOUtils
 from arekit.contrib.utils.pipelines.sources.sentinerel.doc_ops import SentiNERELDocOperation
-from arekit.contrib.utils.pipelines.sources.sentinerel.folding.factory import SentiNERELFoldingFactory
 from arekit.contrib.utils.pipelines.sources.sentinerel.labels_fmt import SentiNERELSentimentLabelFormatter
 from arekit.contrib.utils.pipelines.text_opinion.annot.algo_based import AlgorithmBasedTextOpinionAnnotator
 from arekit.contrib.utils.pipelines.text_opinion.annot.predefined import PredefinedTextOpinionAnnotator
@@ -61,11 +60,8 @@ def create_text_opinion_extraction_pipeline(sentinerel_version,
 
     if doc_ops is None:
         # Default Initialization.
-        filenames_by_ids, data_folding = SentiNERELFoldingFactory.create_fixed_folding(
-            # This is a temporary solution with the "split_filepath.txt"
-            # TODO. This going to be fixed by mentioning this split into archive or so.
-            fixed_split_filepath=join(dirname(__file__), 'split_fixed.txt'),
-            limit=docs_limit)
+        filenames_by_ids, data_folding = SentiNerelIOUtils.read_dataset_split(version=sentinerel_version,
+                                                                              docs_limit=docs_limit)
         doc_ops = SentiNERELDocOperation(filename_by_id=filenames_by_ids,
                                          version=sentinerel_version)
 

This file was deleted.
