-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
21 changed files
with
905 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
from arekit.common.entities.collection import EntityCollection | ||
from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders | ||
from arekit.contrib.source.brat.annot import BratAnnotationParser | ||
from arekit.contrib.source.brat.entities.entity import BratEntity | ||
from arekit.contrib.source.nerel.io_utils import NerelIOUtils | ||
from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper | ||
from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection | ||
|
||
|
||
class NerelEntityCollection(EntityCollection):
    """ Collection of BRAT-annotated entities of a single NEREL document,
        kept sorted by the begin index of every entity.
    """

    def __init__(self, contents, value_to_group_id_func, entities_to_ignore=None):
        """
        entities_to_ignore: list or None
            this parameter is required because of the simplified implementation of
            the nested objects of the BRAT annotation.
        """
        assert(isinstance(contents, dict))
        assert(BratAnnotationParser.ENTITIES in contents)
        assert(isinstance(entities_to_ignore, list) or entities_to_ignore is None)

        # Entity types that should be dropped from the collection.
        self.__discard_entities = set(entities_to_ignore) if entities_to_ignore is not None else set()

        # NOTE: filtering is performed in-place over the passed `contents` dict.
        kept_entities = list(filter(self.__keep_entity, contents[BratAnnotationParser.ENTITIES]))
        contents[BratAnnotationParser.ENTITIES] = kept_entities

        super(NerelEntityCollection, self).__init__(
            entities=kept_entities,
            value_to_group_id_func=value_to_group_id_func)

        self._sort_entities(key=lambda entity: entity.IndexBegin)

    def __keep_entity(self, entity):
        # An entity survives filtering when its type is not among the discarded ones.
        assert(isinstance(entity, BratEntity))
        return entity.Type not in self.__discard_entities

    @classmethod
    def read_collection(cls, filename, version, io_utils, entities_to_ignore=None):
        assert(isinstance(io_utils, NerelIOUtils))
        assert(isinstance(filename, str))

        # Since this dataset does not provide the synonyms collection by default,
        # it is necessary to declare an empty collection to populate so in further.
        synonyms = StemmerBasedSynonymCollection(stemmer=MystemWrapper(), is_read_only=False)

        doc_fold = io_utils.map_doc_to_fold_type(version)

        def __to_group_id(value):
            return SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
                synonyms, value)

        return io_utils.read_from_zip(
            inner_path=io_utils.get_annotation_innerpath(
                folding_data_type=doc_fold[filename], filename=filename),
            process_func=lambda input_file: cls(
                contents=BratAnnotationParser.parse_annotations(input_file=input_file, encoding='utf-8-sig'),
                entities_to_ignore=entities_to_ignore,
                value_to_group_id_func=__to_group_id),
            version=version)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
from collections import OrderedDict | ||
|
||
from arekit.common.experiment.data_type import DataType | ||
|
||
|
||
def create_fixed_folding(train_filenames, dev_filenames, test_filenames, limit=None):
    """ Create fixed data-folding based on the predefined list of filenames,
        written in file.
    """
    assert(isinstance(train_filenames, list))
    assert(isinstance(dev_filenames, list))
    assert(isinstance(test_filenames, list))

    # Index every filename across all the parts, preserving their order.
    filenames_by_ids = create_filenames_by_ids(
        filenames=train_filenames + dev_filenames + test_filenames)

    # Reverse mapping: filename -> assigned document id.
    ids_by_filenames = {fname: doc_id for doc_id, fname in filenames_by_ids.items()}

    # Optionally limit the amount of documents considered per part.
    if limit is not None:
        train_filenames = train_filenames[:limit]
        test_filenames = test_filenames[:limit]
        dev_filenames = dev_filenames[:limit]

    fixed_folding = {
        DataType.Train: [ids_by_filenames[fname] for fname in train_filenames],
        DataType.Test: [ids_by_filenames[fname] for fname in test_filenames],
        DataType.Dev: [ids_by_filenames[fname] for fname in dev_filenames]
    }

    return filenames_by_ids, fixed_folding
|
||
|
||
def create_filenames_by_ids(filenames):
    """ Indexing filenames: assign every filename a unique integer id,
        preferring the numeric prefix of the filename when present.
    """
    filenames_by_ids = OrderedDict()

    # Starting point for the search of the next auto-assigned id.
    next_candidate = 0

    for fname in filenames:

        doc_id = number_from_string(fname)

        if doc_id is None:
            # No numeric prefix: pick the smallest unused id, resuming the
            # search from the last auto-assigned one.
            doc_id = next_candidate
            while doc_id in filenames_by_ids:
                doc_id += 1
            next_candidate = doc_id

        assert(doc_id not in filenames_by_ids)
        filenames_by_ids[doc_id] = fname

    return filenames_by_ids
|
||
|
||
def number_from_string(s):
    """ Extract the integer encoded by the leading run of digit characters of `s`.

        s: str
            source string, e.g. "12_document" yields 12.

        Returns: int or None
            None when `s` does not start with a digit (including the empty string).
    """
    assert(isinstance(s, str))

    # Collect the digit prefix, stopping at the first non-digit character.
    # (The loop variable is named `ch` to avoid shadowing the builtin `chr`.)
    digit_prefix = []
    for ch in s:
        if not ch.isdigit():
            break
        digit_prefix.append(ch)

    return int("".join(digit_prefix)) if digit_prefix else None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
from os import path | ||
|
||
from arekit.common.experiment.data_type import DataType | ||
from arekit.contrib.source.nerel.folding.fixed import create_fixed_folding | ||
from arekit.contrib.source.nerel.utils import iter_filename_and_splittype | ||
from arekit.contrib.source.zip_utils import ZipArchiveUtils | ||
|
||
|
||
class NerelIOUtils(ZipArchiveUtils):
    """ I/O helpers over the zipped NEREL dataset distribution. """

    # Mapping between folding data types and the per-split folders of the archive.
    splits = {
        DataType.Train: "train",
        DataType.Dev: "dev",
        DataType.Test: "test"
    }

    @staticmethod
    def get_archive_filepath(version):
        return path.join(NerelIOUtils.get_data_root(), "nerel-{}.zip".format(version))

    @staticmethod
    def get_annotation_innerpath(folding_data_type, filename):
        assert(isinstance(filename, str))
        return path.join(NerelIOUtils.splits[folding_data_type], "{}.ann".format(filename))

    @staticmethod
    def get_news_innerpath(folding_data_type, filename):
        assert(isinstance(filename, str))
        return path.join(NerelIOUtils.splits[folding_data_type], "{}.txt".format(filename))

    @staticmethod
    def map_doc_to_fold_type(version):
        """ Compose the filename -> split-type mapping over the archive contents. """
        pairs = iter_filename_and_splittype(
            filenames_it=NerelIOUtils.iter_filenames_from_zip(version),
            splits=NerelIOUtils.splits.items())

        return {filename: split_type for filename, split_type in pairs}

    @staticmethod
    def read_dataset_split(version, docs_limit=None):
        """ Group archive filenames per split and compose a fixed folding. """
        pairs = iter_filename_and_splittype(
            filenames_it=NerelIOUtils.iter_filenames_from_zip(version),
            splits=NerelIOUtils.splits.items())

        # split type -> list of related filenames.
        f2d = {}
        for filename, split_type in pairs:
            f2d.setdefault(split_type, []).append(filename)

        filenames_by_ids, data_folding = create_fixed_folding(
            train_filenames=f2d[DataType.Train],
            test_filenames=f2d[DataType.Test],
            dev_filenames=f2d[DataType.Dev],
            limit=docs_limit)

        return filenames_by_ids, data_folding
Oops, something went wrong.