From 5369f6efbc3abdac6f1f675df77746f86a68d6c6 Mon Sep 17 00:00:00 2001 From: Nicolay Rusnachenko Date: Sat, 29 Jul 2023 14:15:16 +0100 Subject: [PATCH] #507 added NEREL collection support --- arekit/contrib/source/download.py | 5 +- arekit/contrib/source/nerel/__init__.py | 0 arekit/contrib/source/nerel/entities.py | 54 +++++++++++ .../contrib/source/nerel/folding/__init__.py | 0 arekit/contrib/source/nerel/folding/fixed.py | 71 ++++++++++++++ arekit/contrib/source/nerel/io_utils.py | 93 +++++++++++++++++++ arekit/contrib/source/nerel/reader.py | 41 ++++++++ tests/contrib/source/test_nerel.py | 33 +++++++ tests/contrib/source/test_sentinerel.py | 2 +- 9 files changed, 297 insertions(+), 2 deletions(-) create mode 100644 arekit/contrib/source/nerel/__init__.py create mode 100644 arekit/contrib/source/nerel/entities.py create mode 100644 arekit/contrib/source/nerel/folding/__init__.py create mode 100644 arekit/contrib/source/nerel/folding/fixed.py create mode 100644 arekit/contrib/source/nerel/io_utils.py create mode 100644 arekit/contrib/source/nerel/reader.py create mode 100644 tests/contrib/source/test_nerel.py diff --git a/arekit/contrib/source/download.py b/arekit/contrib/source/download.py index 36634b0a..7dca4d78 100644 --- a/arekit/contrib/source/download.py +++ b/arekit/contrib/source/download.py @@ -27,7 +27,10 @@ def download(): # SentiNEREL "sentinerel-v1_0.zip": "https://www.dropbox.com/s//sentinerel-v1_0.zip?dl=1", "sentinerel-v2_0.zip": "https://www.dropbox.com/s//sentinerel-v2_0.zip?dl=1", - "sentinerel-v2_1.zip": "https://www.dropbox.com/s//sentinerel-v2_1.zip?dl=1" + "sentinerel-v2_1.zip": "https://www.dropbox.com/s//sentinerel-v2_1.zip?dl=1", + # NEREL + "nerel-v1_0.zip": "https://www.dropbox.com/scl/fi/vegk0aczjdm9km410loqv/nerel-v1_0.zip?rlkey=wv0ut86n3x5ao6xabsaxd7lh7&dl=1", + "nerel-v1_1.zip": "https://www.dropbox.com/scl/fi/oaytj0rvx7vhdxjk98x7g/nerel-v1_1.zip?rlkey=klrq0l5rpn10cf7e2swkay6r4&dl=1" } # Perform downloading ... 
diff --git a/arekit/contrib/source/nerel/__init__.py b/arekit/contrib/source/nerel/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/arekit/contrib/source/nerel/entities.py b/arekit/contrib/source/nerel/entities.py new file mode 100644 index 00000000..db7272f0 --- /dev/null +++ b/arekit/contrib/source/nerel/entities.py @@ -0,0 +1,54 @@ +from arekit.common.entities.collection import EntityCollection +from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders +from arekit.contrib.source.brat.annot import BratAnnotationParser +from arekit.contrib.source.brat.entities.entity import BratEntity +from arekit.contrib.source.nerel.io_utils import NerelIOUtils +from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper +from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection + + +class NerelEntityCollection(EntityCollection): + + def __init__(self, contents, value_to_group_id_func, entities_to_ignore=None): + """ + entities_to_ignore: list or None + this parameter is required because of the simplified implementation of + the nested objects of the BRAT annotation. 
+ """ + assert(isinstance(contents, dict)) + assert(BratAnnotationParser.ENTITIES in contents) + assert(isinstance(entities_to_ignore, list) or entities_to_ignore is None) + + self.__dicard_entities = set([] if entities_to_ignore is None else entities_to_ignore) + contents[BratAnnotationParser.ENTITIES] = [e for e in contents[BratAnnotationParser.ENTITIES] + if self.__keep_entity(e)] + + super(NerelEntityCollection, self).__init__( + entities=contents[BratAnnotationParser.ENTITIES], + value_to_group_id_func=value_to_group_id_func) + + self._sort_entities(key=lambda entity: entity.IndexBegin) + + def __keep_entity(self, entity): + assert(isinstance(entity, BratEntity)) + return entity.Type not in self.__dicard_entities + + @classmethod + def read_collection(cls, filename, version, entities_to_ignore=None): + assert(isinstance(filename, str)) + + # Since this dataset does not provide the synonyms collection by default, + # it is necessary to declare an empty collection to be populated later. 
+ synonyms = StemmerBasedSynonymCollection(stemmer=MystemWrapper(), is_read_only=False) + + doc_fold = NerelIOUtils.map_doc_to_fold_type(version) + + return NerelIOUtils.read_from_zip( + inner_path=NerelIOUtils.get_annotation_innerpath(folding_data_type=doc_fold[filename], filename=filename), + process_func=lambda input_file: cls( + contents=BratAnnotationParser.parse_annotations(input_file=input_file, encoding='utf-8-sig'), + entities_to_ignore=entities_to_ignore, + value_to_group_id_func=lambda value: + SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value( + synonyms, value)), + version=version) diff --git a/arekit/contrib/source/nerel/folding/__init__.py b/arekit/contrib/source/nerel/folding/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/arekit/contrib/source/nerel/folding/fixed.py b/arekit/contrib/source/nerel/folding/fixed.py new file mode 100644 index 00000000..f0ba7e82 --- /dev/null +++ b/arekit/contrib/source/nerel/folding/fixed.py @@ -0,0 +1,71 @@ +from collections import OrderedDict + +from arekit.common.experiment.data_type import DataType +from arekit.common.folding.fixed import FixedFolding + + +def create_fixed_folding(train_filenames, dev_filenames, test_filenames): + """ Create fixed data-folding based on the predefined list of filenames, + written in file. 
+ """ + assert(isinstance(train_filenames, list)) + assert(isinstance(dev_filenames, list)) + assert(isinstance(test_filenames, list)) + + filenames_by_ids = create_filenames_by_ids(filenames=train_filenames + dev_filenames + test_filenames) + + ids_by_filenames = {} + for doc_id, filename in filenames_by_ids.items(): + ids_by_filenames[filename] = doc_id + + fixed_folding = FixedFolding.from_parts({ + DataType.Train: [ids_by_filenames[filename] for filename in train_filenames], + DataType.Test: [ids_by_filenames[filename] for filename in test_filenames], + DataType.Dev: [ids_by_filenames[filename] for filename in dev_filenames] + }) + + return filenames_by_ids, fixed_folding + + +def create_filenames_by_ids(filenames): + """ Indexing filenames + """ + + def __create_new_id(default_id): + new_id = default_id + while new_id in filenames_by_ids: + new_id += 1 + return new_id + + default_id = 0 + + filenames_by_ids = OrderedDict() + for fname in filenames: + + doc_id = number_from_string(fname) + + if doc_id is None: + doc_id = __create_new_id(default_id) + default_id = doc_id + + assert(doc_id not in filenames_by_ids) + filenames_by_ids[doc_id] = fname + + return filenames_by_ids + + +def number_from_string(s): + assert(isinstance(s, str)) + + digit_chars_prefix = [] + + for chr in s: + if chr.isdigit(): + digit_chars_prefix.append(chr) + else: + break + + if len(digit_chars_prefix) == 0: + return None + + return int("".join(digit_chars_prefix)) diff --git a/arekit/contrib/source/nerel/io_utils.py b/arekit/contrib/source/nerel/io_utils.py new file mode 100644 index 00000000..65f5c7a7 --- /dev/null +++ b/arekit/contrib/source/nerel/io_utils.py @@ -0,0 +1,93 @@ +import enum +from os import path +from os.path import basename + +from arekit.common.experiment.data_type import DataType +from arekit.contrib.source.nerel.folding.fixed import create_fixed_folding +from arekit.contrib.source.zip_utils import ZipArchiveUtils + + +class NerelVersions(enum.Enum): + """ List of 
the supported versions of this collection + """ + + V1 = "v1_0" + V11 = "v1_1" + + +DEFAULT_VERSION = NerelVersions.V1 + + +class NerelIOUtils(ZipArchiveUtils): + + splits = { + DataType.Train: "train", + DataType.Dev: "dev", + DataType.Test: "test" + } + + @staticmethod + def get_archive_filepath(version): + return path.join(NerelIOUtils.get_data_root(), "nerel-{}.zip".format(version)) + + @staticmethod + def get_annotation_innerpath(folding_data_type, filename): + assert(isinstance(filename, str)) + return path.join(NerelIOUtils.splits[folding_data_type], "{}.ann".format(filename)) + + @staticmethod + def get_news_innerpath(folding_data_type, filename): + assert(isinstance(filename, str)) + return path.join(NerelIOUtils.splits[folding_data_type], "{}.txt".format(filename)) + + @staticmethod + def __iter_filenames_from_dataset(version): + assert(isinstance(version, enum.Enum)) + + for filename in NerelIOUtils.iter_filenames_from_zip(version): + + extension = filename[-4:] + + # Crop extension. 
+ filename = filename[:-4] + + if extension != ".txt": + continue + + yield filename, basename(filename) + + @staticmethod + def __iter_filename_and_splittype(version): + filenames_it = NerelIOUtils.__iter_filenames_from_dataset(version=version) + for doc_id, data in enumerate(filenames_it): + filepath, filename = data + for split_type, split_name in NerelIOUtils.splits.items(): + if split_name in filepath: + yield filename, split_type + + @staticmethod + def iter_collection_filenames(version=DEFAULT_VERSION): + filenames_it = NerelIOUtils.__iter_filenames_from_dataset(version=version) + for doc_id, filename in enumerate(filenames_it): + yield doc_id, filename + + @staticmethod + def map_doc_to_fold_type(version=DEFAULT_VERSION): + d2f = {} + for filename, split_type in NerelIOUtils.__iter_filename_and_splittype(version): + d2f[filename] = split_type + return d2f + + @staticmethod + def read_dataset_split(version=DEFAULT_VERSION): + f2d = {} + for filename, split_type in NerelIOUtils.__iter_filename_and_splittype(version): + if split_type not in f2d: + f2d[split_type] = [] + f2d[split_type].append(filename) + + filenames_by_ids, data_folding = create_fixed_folding(train_filenames=f2d[DataType.Train], + test_filenames=f2d[DataType.Test], + dev_filenames=f2d[DataType.Dev]) + + return filenames_by_ids, data_folding \ No newline at end of file diff --git a/arekit/contrib/source/nerel/reader.py b/arekit/contrib/source/nerel/reader.py new file mode 100644 index 00000000..7338b746 --- /dev/null +++ b/arekit/contrib/source/nerel/reader.py @@ -0,0 +1,41 @@ +from arekit.contrib.source.brat.annot import BratAnnotationParser +from arekit.contrib.source.brat.news import BratNews +from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader +from arekit.contrib.source.nerel.entities import NerelEntityCollection +from arekit.contrib.source.nerel.io_utils import NerelIOUtils, DEFAULT_VERSION + + +class NerelDocReader(object): + + @staticmethod + def 
read_text_relations(folding_type, filename, version): + assert(isinstance(filename, str)) + + return NerelIOUtils.read_from_zip( + inner_path=NerelIOUtils.get_annotation_innerpath(folding_data_type=folding_type, filename=filename), + process_func=lambda input_file: [ + relation for relation in BratAnnotationParser.parse_annotations( + input_file=input_file, encoding='utf-8-sig')["relations"]], + version=version) + + @staticmethod + def read_document(filename, doc_id, doc_fold=None, version=DEFAULT_VERSION, entities_to_ignore=None): + assert(isinstance(filename, str)) + assert(isinstance(doc_id, int)) + + def file_to_doc(input_file): + sentences = BratDocumentSentencesReader.from_file(input_file=input_file, entities=entities) + return BratNews(doc_id=doc_id, sentences=sentences, text_relations=text_relations) + + entities = NerelEntityCollection.read_collection( + filename=filename, version=version, entities_to_ignore=entities_to_ignore) + + doc_fold = NerelIOUtils.map_doc_to_fold_type(version) if doc_fold is None else doc_fold + + text_relations = NerelDocReader.read_text_relations( + folding_type=doc_fold[filename], filename=filename, version=version) + + return NerelIOUtils.read_from_zip( + inner_path=NerelIOUtils.get_news_innerpath(folding_data_type=doc_fold[filename], filename=filename), + process_func=file_to_doc, + version=version) diff --git a/tests/contrib/source/test_nerel.py b/tests/contrib/source/test_nerel.py new file mode 100644 index 00000000..50c42e62 --- /dev/null +++ b/tests/contrib/source/test_nerel.py @@ -0,0 +1,33 @@ +import unittest + +from tqdm import tqdm + +from arekit.contrib.source.brat.news import BratNews +from arekit.contrib.source.brat.relation import BratRelation +from arekit.contrib.source.brat.sentence import BratSentence +from arekit.contrib.source.nerel.io_utils import NerelIOUtils +from arekit.contrib.source.nerel.reader import NerelDocReader + + +class TestNerelRead(unittest.TestCase): + + def test(self): + news = 
NerelDocReader.read_document(filename="109230_text", doc_id=0) + assert(isinstance(news, BratNews)) + print("Sentences Count:", news.SentencesCount) + for sentence in news.iter_sentences(): + assert(isinstance(sentence, BratSentence)) + print(sentence.Text.strip()) + for entity, bound in sentence.iter_entity_with_local_bounds(): + print("{}: ['{}',{}, {}]".format( + entity.ID, entity.Value, entity.Type, + "-".join([str(bound.Position), str(bound.Position+bound.Length)]))) + + for brat_relation in news.Relations: + assert(isinstance(brat_relation, BratRelation)) + print(brat_relation.SourceID, brat_relation.TargetID, brat_relation.Type) + + def test_all_documents(self): + filenames_by_ids, folding = NerelIOUtils.read_dataset_split() + for doc_id in tqdm(folding.iter_doc_ids(), total=len(list(folding.iter_doc_ids()))): + NerelDocReader.read_document(filename=filenames_by_ids[doc_id], doc_id=0) diff --git a/tests/contrib/source/test_sentinerel.py b/tests/contrib/source/test_sentinerel.py index 4b570684..67cb726b 100644 --- a/tests/contrib/source/test_sentinerel.py +++ b/tests/contrib/source/test_sentinerel.py @@ -6,7 +6,7 @@ from arekit.contrib.source.sentinerel.reader import SentiNerelDocReader -class TestRead(unittest.TestCase): +class TestSentiNERELRead(unittest.TestCase): def test(self): news = SentiNerelDocReader.read_document(filename="2070_text", doc_id=0)