Skip to content

Commit

Permalink
#507 refactoring NEREL. #508 added nerel-bio
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Aug 6, 2023
1 parent 737d569 commit 1f840bb
Show file tree
Hide file tree
Showing 13 changed files with 337 additions and 63 deletions.
4 changes: 3 additions & 1 deletion arekit/contrib/source/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ def download():
"sentinerel-v2_1.zip": "https://www.dropbox.com/s/<HIDDEN>/sentinerel-v2_1.zip?dl=1",
# NEREL
"nerel-v1_0.zip": "https://www.dropbox.com/scl/fi/vegk0aczjdm9km410loqv/nerel-v1_0.zip?rlkey=wv0ut86n3x5ao6xabsaxd7lh7&dl=1",
"nerel-v1_1.zip": "https://www.dropbox.com/scl/fi/oaytj0rvx7vhdxjk98x7g/nerel-v1_1.zip?rlkey=klrq0l5rpn10cf7e2swkay6r4&dl=1"
"nerel-v1_1.zip": "https://www.dropbox.com/scl/fi/oaytj0rvx7vhdxjk98x7g/nerel-v1_1.zip?rlkey=klrq0l5rpn10cf7e2swkay6r4&dl=1",
# NEREL-BIO
"nerel-bio-v1_0.zip": "https://www.dropbox.com/scl/fi/nltuulfixbkhg3raczash/nerel-bio-v1_0.zip?rlkey=86uizq1hbkgkx302c5p5znpp6&dl=1"
}

# Perform downloading ...
Expand Down
9 changes: 5 additions & 4 deletions arekit/contrib/source/nerel/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,18 @@ def __keep_entity(self, entity):
return entity.Type not in self.__discard_entities

@classmethod
def read_collection(cls, filename, version, entities_to_ignore=None):
def read_collection(cls, filename, version, io_utils, entities_to_ignore=None):
assert(isinstance(io_utils, NerelIOUtils))
assert(isinstance(filename, str))

# Since this dataset does not provide a synonyms collection by default,
# it is necessary to declare an empty collection that will be populated later.
synonyms = StemmerBasedSynonymCollection(stemmer=MystemWrapper(), is_read_only=False)

doc_fold = NerelIOUtils.map_doc_to_fold_type(version)
doc_fold = io_utils.map_doc_to_fold_type(version)

return NerelIOUtils.read_from_zip(
inner_path=NerelIOUtils.get_annotation_innerpath(folding_data_type=doc_fold[filename], filename=filename),
return io_utils.read_from_zip(
inner_path=io_utils.get_annotation_innerpath(folding_data_type=doc_fold[filename], filename=filename),
process_func=lambda input_file: cls(
contents=BratAnnotationParser.parse_annotations(input_file=input_file, encoding='utf-8-sig'),
entities_to_ignore=entities_to_ignore,
Expand Down
62 changes: 15 additions & 47 deletions arekit/contrib/source/nerel/io_utils.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,11 @@
import enum
from os import path
from os.path import basename

from arekit.common.experiment.data_type import DataType
from arekit.contrib.source.nerel.folding.fixed import create_fixed_folding
from arekit.contrib.source.nerel.utils import iter_filename_and_splittype
from arekit.contrib.source.zip_utils import ZipArchiveUtils


class NerelVersions(enum.Enum):
""" List of the supported version of this collection
"""

V1 = "v1_0"
V11 = "v1_1"


DEFAULT_VERSION = NerelVersions.V1


class NerelIOUtils(ZipArchiveUtils):

splits = {
Expand All @@ -41,47 +29,27 @@ def get_news_innerpath(folding_data_type, filename):
return path.join(NerelIOUtils.splits[folding_data_type], "{}.txt".format(filename))

@staticmethod
def __iter_filenames_from_dataset(version):
assert(isinstance(version, enum.Enum))

for filename in NerelIOUtils.iter_filenames_from_zip(version):

extension = filename[-4:]

# Crop extension.
filename = filename[:-4]

if extension != ".txt":
continue
def map_doc_to_fold_type(version):

yield filename, basename(filename)
it = iter_filename_and_splittype(
filenames_it=NerelIOUtils.iter_filenames_from_zip(version),
splits=NerelIOUtils.splits.items())

@staticmethod
def __iter_filename_and_splittype(version):
filenames_it = NerelIOUtils.__iter_filenames_from_dataset(version=version)
for doc_id, data in enumerate(filenames_it):
filepath, filename = data
for split_type, split_name in NerelIOUtils.splits.items():
if split_name in filepath:
yield filename, split_type

@staticmethod
def iter_collection_filenames(version=DEFAULT_VERSION):
filenames_it = NerelIOUtils.__iter_filenames_from_dataset(version=version)
for doc_id, filename in enumerate(filenames_it):
yield doc_id, filename

@staticmethod
def map_doc_to_fold_type(version=DEFAULT_VERSION):
d2f = {}
for filename, split_type in NerelIOUtils.__iter_filename_and_splittype(version):
for filename, split_type in it:
d2f[filename] = split_type

return d2f

@staticmethod
def read_dataset_split(version=DEFAULT_VERSION, docs_limit=None):
def read_dataset_split(version, docs_limit=None):

it = iter_filename_and_splittype(
filenames_it=NerelIOUtils.iter_filenames_from_zip(version),
splits=NerelIOUtils.splits.items())

f2d = {}
for filename, split_type in NerelIOUtils.__iter_filename_and_splittype(version):
for filename, split_type in it:
if split_type not in f2d:
f2d[split_type] = []
f2d[split_type].append(filename)
Expand All @@ -91,4 +59,4 @@ def read_dataset_split(version=DEFAULT_VERSION, docs_limit=None):
dev_filenames=f2d[DataType.Dev],
limit=docs_limit)

return filenames_by_ids, data_folding
return filenames_by_ids, data_folding
19 changes: 11 additions & 8 deletions arekit/contrib/source/nerel/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,22 @@
from arekit.contrib.source.brat.news import BratNews
from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
from arekit.contrib.source.nerel.entities import NerelEntityCollection
from arekit.contrib.source.nerel.io_utils import NerelIOUtils, DEFAULT_VERSION
from arekit.contrib.source.nerel.io_utils import NerelIOUtils


class NerelDocReader(object):

def __init__(self, version=DEFAULT_VERSION):
def __init__(self, version, io_utils=NerelIOUtils()):
assert(isinstance(io_utils, NerelIOUtils))
self.__version = version
self.__doc_fold = NerelIOUtils.map_doc_to_fold_type(version)
self.__io_utils = io_utils
self.__doc_fold = io_utils.map_doc_to_fold_type(version)

def read_text_relations(self, filename):
assert(isinstance(filename, str))

return NerelIOUtils.read_from_zip(
inner_path=NerelIOUtils.get_annotation_innerpath(
return self.__io_utils.read_from_zip(
inner_path=self.__io_utils.get_annotation_innerpath(
folding_data_type=self.__doc_fold[filename],
filename=filename),
process_func=lambda input_file: [
Expand All @@ -32,12 +34,13 @@ def file_to_doc(input_file):
return BratNews(doc_id=doc_id, sentences=sentences, text_relations=text_relations)

entities = NerelEntityCollection.read_collection(
filename=filename, version=self.__version, entities_to_ignore=entities_to_ignore)
filename=filename, version=self.__version,
entities_to_ignore=entities_to_ignore, io_utils=self.__io_utils)

text_relations = self.read_text_relations(filename=filename)

return NerelIOUtils.read_from_zip(
inner_path=NerelIOUtils.get_news_innerpath(
return self.__io_utils.read_from_zip(
inner_path=self.__io_utils.get_news_innerpath(
folding_data_type=self.__doc_fold[filename], filename=filename),
process_func=file_to_doc,
version=self.__version)
24 changes: 24 additions & 0 deletions arekit/contrib/source/nerel/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from os.path import basename


def __iter_filtered_filenames(filenames_iter):
    """ Keep only the ".txt" entries of the given filenames and crop their extension.

    filenames_iter: iterable of str
        candidate file paths.

    yields: tuple (str, str)
        the path without its ".txt" suffix, and the basename of that cropped path.
    """
    suffix = ".txt"
    for filepath in filenames_iter:
        # Entries without the ".txt" extension are skipped.
        if not filepath.endswith(suffix):
            continue
        cropped = filepath[:-len(suffix)]
        yield cropped, basename(cropped)


def iter_filename_and_splittype(filenames_it, splits):
    """ Pair every ".txt" document with the split(s) its path refers to.

    filenames_it: iterable of str
        file paths; filtered and cropped by __iter_filtered_filenames.
    splits: iterable of (split_type, split_name) pairs
        e.g. the items of a dict; may be a one-shot iterator, since it is
        materialized once before the documents are traversed.

    yields: tuple (filename, split_type)
        the document basename (without extension) and the type of
        the split whose name occurs in the document path.
    """
    # Materialize once: the pairs are traversed again for every document, so a
    # one-shot iterator would otherwise be exhausted after the first document.
    split_pairs = tuple(splits)

    for filepath, filename in __iter_filtered_filenames(filenames_it):
        for split_type, split_name in split_pairs:
            # NOTE: this is a substring check, so a document is yielded once
            # per split name found anywhere in its path.
            if split_name in filepath:
                yield filename, split_type


def iter_collection_filenames(filenames_it):
    """ Enumerate the ".txt" documents found among the given filenames.

    filenames_it: iterable of str
        file paths; filtered and cropped by __iter_filtered_filenames.

    yields: tuple (int, (str, str))
        zero-based document index, followed by the pair of
        (path without extension, basename of that path).
    """
    yield from enumerate(__iter_filtered_filenames(filenames_it))
12 changes: 12 additions & 0 deletions arekit/contrib/source/nerel/versions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import enum


class NerelVersions(enum.Enum):
    """ Supported versions of this collection.

    The value of every member is the version tag as a string.
    """

    V1 = "v1_0"
    V11 = "v1_1"


# Default version of the collection: NerelVersions.V1 ("v1_0").
DEFAULT_VERSION = NerelVersions.V1
Empty file.
62 changes: 62 additions & 0 deletions arekit/contrib/source/nerelbio/io_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from os import path

from arekit.common.experiment.data_type import DataType
from arekit.contrib.source.nerel.folding.fixed import create_fixed_folding
from arekit.contrib.source.nerel.io_utils import NerelIOUtils
from arekit.contrib.source.nerel.utils import iter_filename_and_splittype


class NerelBioIOUtils(NerelIOUtils):
    """ Archive access helpers for the NEREL-BIO collection.

    Redefines the archive location, the split names and the split-related
    helpers of NerelIOUtils, so that they refer to the
    "nerel-bio-<version>.zip" archive.
    """

    # Inner directory of the archive used for each data type.
    splits = {
        DataType.Train: "train",
        DataType.Dev: "dev",
        DataType.Test: "test"
    }

    @staticmethod
    def get_archive_filepath(version):
        """ Path of the "nerel-bio-<version>.zip" archive within the data root.
        """
        data_root = NerelBioIOUtils.get_data_root()
        return path.join(data_root, "nerel-bio-{}.zip".format(version))

    @staticmethod
    def get_annotation_innerpath(folding_data_type, filename):
        """ Inner path of the ".ann" file of the document within its split directory.
        """
        assert isinstance(filename, str)
        split_dir = NerelBioIOUtils.splits[folding_data_type]
        return path.join(split_dir, "{}.ann".format(filename))

    @staticmethod
    def get_news_innerpath(folding_data_type, filename):
        """ Inner path of the ".txt" file of the document within its split directory.
        """
        assert isinstance(filename, str)
        split_dir = NerelBioIOUtils.splits[folding_data_type]
        return path.join(split_dir, "{}.txt".format(filename))

    @staticmethod
    def map_doc_to_fold_type(version):
        """ Compose a dictionary that maps a document filename onto its split type.

        When the same filename is reported more than once, the last
        reported split type is kept.
        """
        pairs = iter_filename_and_splittype(
            filenames_it=NerelBioIOUtils.iter_filenames_from_zip(version),
            splits=NerelBioIOUtils.splits.items())

        return dict(pairs)

    @staticmethod
    def read_dataset_split(version, docs_limit=None):
        """ Group the archive documents by split type and build the fixed folding.

        version: version of the collection, forwarded to iter_filenames_from_zip.
        docs_limit: forwarded to create_fixed_folding as `limit`.

        returns: tuple
            (filenames_by_ids, data_folding) as produced by create_fixed_folding.

        raises: KeyError
            when no document was found for the Train, Test or Dev split.
        """
        pairs = iter_filename_and_splittype(
            filenames_it=NerelBioIOUtils.iter_filenames_from_zip(version),
            splits=NerelBioIOUtils.splits.items())

        # Split type -> filenames, in the order they were reported.
        names_by_split = {}
        for filename, split_type in pairs:
            names_by_split.setdefault(split_type, []).append(filename)

        filenames_by_ids, data_folding = create_fixed_folding(
            train_filenames=names_by_split[DataType.Train],
            test_filenames=names_by_split[DataType.Test],
            dev_filenames=names_by_split[DataType.Dev],
            limit=docs_limit)

        return filenames_by_ids, data_folding
Loading

0 comments on commit 1f840bb

Please sign in to comment.