Skip to content

Commit

Permalink
#507 added NEREL collection support
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Jul 29, 2023
1 parent 2691a46 commit 5369f6e
Show file tree
Hide file tree
Showing 9 changed files with 297 additions and 2 deletions.
5 changes: 4 additions & 1 deletion arekit/contrib/source/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ def download():
# SentiNEREL
"sentinerel-v1_0.zip": "https://www.dropbox.com/s/<HIDDEN>/sentinerel-v1_0.zip?dl=1",
"sentinerel-v2_0.zip": "https://www.dropbox.com/s/<HIDDEN>/sentinerel-v2_0.zip?dl=1",
"sentinerel-v2_1.zip": "https://www.dropbox.com/s/<HIDDEN>/sentinerel-v2_1.zip?dl=1"
"sentinerel-v2_1.zip": "https://www.dropbox.com/s/<HIDDEN>/sentinerel-v2_1.zip?dl=1",
# NEREL
"nerel-v1_0.zip": "https://www.dropbox.com/scl/fi/vegk0aczjdm9km410loqv/nerel-v1_0.zip?rlkey=wv0ut86n3x5ao6xabsaxd7lh7&dl=1",
"nerel-v1_1.zip": "https://www.dropbox.com/scl/fi/oaytj0rvx7vhdxjk98x7g/nerel-v1_1.zip?rlkey=klrq0l5rpn10cf7e2swkay6r4&dl=1"
}

# Perform downloading ...
Expand Down
Empty file.
54 changes: 54 additions & 0 deletions arekit/contrib/source/nerel/entities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from arekit.common.entities.collection import EntityCollection
from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
from arekit.contrib.source.brat.annot import BratAnnotationParser
from arekit.contrib.source.brat.entities.entity import BratEntity
from arekit.contrib.source.nerel.io_utils import NerelIOUtils
from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection


class NerelEntityCollection(EntityCollection):
    """ Entities of a single NEREL document, taken from its parsed BRAT annotation.
        Entities of the ignored types are dropped, and the remaining ones are
        passed to `_sort_entities` keyed by `IndexBegin`.
    """

    def __init__(self, contents, value_to_group_id_func, entities_to_ignore=None):
        """
        contents: dict
            parsed BRAT annotation; the list of entities is expected
            under the `BratAnnotationParser.ENTITIES` key.
        value_to_group_id_func: func
            forwarded as is to the base `EntityCollection`.
        entities_to_ignore: list or None
            entity types to drop from the collection.
            This parameter is required because of the simplified implementation of
            the nested objects of the BRAT annotation.
        """
        assert(isinstance(contents, dict))
        assert(BratAnnotationParser.ENTITIES in contents)
        assert(isinstance(entities_to_ignore, list) or entities_to_ignore is None)

        self.__discard_entities = set() if entities_to_ignore is None else set(entities_to_ignore)

        # Filter into a separate list: the dictionary passed by the caller
        # is left untouched, so it still describes the complete annotation.
        kept_entities = [entity for entity in contents[BratAnnotationParser.ENTITIES]
                         if self.__keep_entity(entity)]

        super(NerelEntityCollection, self).__init__(
            entities=kept_entities,
            value_to_group_id_func=value_to_group_id_func)

        self._sort_entities(key=lambda entity: entity.IndexBegin)

    def __keep_entity(self, entity):
        """ Whether the entity type is outside of the list of the ignored types.
        """
        assert(isinstance(entity, BratEntity))
        return entity.Type not in self.__discard_entities

    @classmethod
    def read_collection(cls, filename, version, entities_to_ignore=None):
        """ Read the entities of the document from the `.ann` file of the archive.

            filename: str
                document name without extension; it must be a key of the
                mapping returned by `NerelIOUtils.map_doc_to_fold_type`.
            version: enum
                collection version, forwarded to `NerelIOUtils`.
            entities_to_ignore: list or None
                entity types to exclude from the result.

            returns: whatever `NerelIOUtils.read_from_zip` returns for the
                process function that builds the collection.
        """
        assert(isinstance(filename, str))

        # Since this dataset does not provide the synonyms collection by default,
        # it is necessary to declare an empty collection and populate it later on.
        synonyms = StemmerBasedSynonymCollection(stemmer=MystemWrapper(), is_read_only=False)

        doc_fold = NerelIOUtils.map_doc_to_fold_type(version)

        return NerelIOUtils.read_from_zip(
            inner_path=NerelIOUtils.get_annotation_innerpath(folding_data_type=doc_fold[filename], filename=filename),
            process_func=lambda input_file: cls(
                contents=BratAnnotationParser.parse_annotations(input_file=input_file, encoding='utf-8-sig'),
                entities_to_ignore=entities_to_ignore,
                value_to_group_id_func=lambda value:
                SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
                    synonyms, value)),
            version=version)
Empty file.
71 changes: 71 additions & 0 deletions arekit/contrib/source/nerel/folding/fixed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from collections import OrderedDict

from arekit.common.experiment.data_type import DataType
from arekit.common.folding.fixed import FixedFolding


def create_fixed_folding(train_filenames, dev_filenames, test_filenames):
    """ Build the fixed folding out of three predefined lists of filenames.

        train_filenames, dev_filenames, test_filenames: list
            names of the documents of every part.

        returns: pair of
            - filenames indexed by document ids (see `create_filenames_by_ids`);
            - result of `FixedFolding.from_parts` for the ids of every part.
    """
    for part_filenames in (train_filenames, dev_filenames, test_filenames):
        assert(isinstance(part_filenames, list))

    # Ids are assigned over the train, dev and test documents, in this order.
    all_filenames = train_filenames + dev_filenames + test_filenames
    filenames_by_ids = create_filenames_by_ids(filenames=all_filenames)

    # Reversed index: from filename to the related document id.
    ids_by_filenames = {filename: doc_id for doc_id, filename in filenames_by_ids.items()}

    def to_doc_ids(part_filenames):
        return [ids_by_filenames[filename] for filename in part_filenames]

    parts = {
        DataType.Train: to_doc_ids(train_filenames),
        DataType.Test: to_doc_ids(test_filenames),
        DataType.Dev: to_doc_ids(dev_filenames)
    }

    return filenames_by_ids, FixedFolding.from_parts(parts)


def create_filenames_by_ids(filenames):
    """ Index filenames by integer document ids.

        The id of a filename is the number it starts with (see `number_from_string`).
        A filename without such a number receives the smallest free id, counted
        up from zero, that is not the numeric prefix of any filename in the list.

        filenames: iterable of str
        returns: OrderedDict
            id -> filename, in the order of the given filenames.
        raises: AssertionError
            when two filenames share the same numeric prefix.
    """
    # Materialized, since the names are traversed twice.
    filenames = list(filenames)

    prefix_ids = [number_from_string(fname) for fname in filenames]

    # Ids claimed by numeric prefixes are reserved in advance; otherwise
    # an id generated for an earlier filename may clash with the prefix
    # of a later one (e.g. ["abc", "0_text"]).
    reserved_ids = {doc_id for doc_id in prefix_ids if doc_id is not None}

    next_free_id = 0

    filenames_by_ids = OrderedDict()
    for fname, doc_id in zip(filenames, prefix_ids):

        if doc_id is None:
            while next_free_id in filenames_by_ids or next_free_id in reserved_ids:
                next_free_id += 1
            doc_id = next_free_id

        assert doc_id not in filenames_by_ids, \
            "Document id {} is assigned to both '{}' and '{}'".format(doc_id, filenames_by_ids.get(doc_id), fname)
        filenames_by_ids[doc_id] = fname

    return filenames_by_ids


def number_from_string(s):
    """ Parse the integer formed by the leading decimal characters of the string.

        s: str
        returns: int or None
            None when the string does not start with a decimal character.
    """
    assert(isinstance(s, str))

    # `isdecimal` rather than `isdigit`: the latter also accepts characters
    # such as the superscript two, which `int` is unable to convert.
    prefix_length = 0
    for symbol in s:
        if not symbol.isdecimal():
            break
        prefix_length += 1

    if prefix_length == 0:
        return None

    return int(s[:prefix_length])
93 changes: 93 additions & 0 deletions arekit/contrib/source/nerel/io_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import enum
from os import path
from os.path import basename

from arekit.common.experiment.data_type import DataType
from arekit.contrib.source.nerel.folding.fixed import create_fixed_folding
from arekit.contrib.source.zip_utils import ZipArchiveUtils


class NerelVersions(enum.Enum):
    """ Versions of the NEREL collection supported by this module.
        The value of every item is the version label of the related archive.
    """

    V1 = "v1_0"
    V11 = "v1_1"


# Version which is considered when the other one is not specified explicitly.
DEFAULT_VERSION = NerelVersions.V1


class NerelIOUtils(ZipArchiveUtils):
    """ Paths and document listings for the zip archive of the NEREL collection.
    """

    # Archive folder name for every data type.
    splits = {
        DataType.Train: "train",
        DataType.Dev: "dev",
        DataType.Test: "test"
    }

    @staticmethod
    def get_archive_filepath(version):
        """ Path of the collection archive: `nerel-<version>.zip` in the data root.
        """
        return path.join(NerelIOUtils.get_data_root(), "nerel-{}.zip".format(version))

    @staticmethod
    def get_annotation_innerpath(folding_data_type, filename):
        """ Path of the `.ann` file of the document inside the archive.
        """
        assert(isinstance(filename, str))
        return path.join(NerelIOUtils.splits[folding_data_type], "{}.ann".format(filename))

    @staticmethod
    def get_news_innerpath(folding_data_type, filename):
        """ Path of the `.txt` file of the document inside the archive.
        """
        assert(isinstance(filename, str))
        return path.join(NerelIOUtils.splits[folding_data_type], "{}.txt".format(filename))

    @staticmethod
    def __iter_filenames_from_dataset(version):
        """ Iterate over the `.txt` entries of the archive.

            yields: pairs of (inner path without extension, its basename).
        """
        assert(isinstance(version, enum.Enum))

        for filename in NerelIOUtils.iter_filenames_from_zip(version):

            if not filename.endswith(".txt"):
                continue

            # Crop extension.
            cropped = filename[:-4]

            yield cropped, basename(cropped)

    @staticmethod
    def __iter_filename_and_splittype(version):
        """ Iterate over the pairs of (document name, data type of its folder).
        """
        filenames_it = NerelIOUtils.__iter_filenames_from_dataset(version=version)
        for filepath, filename in filenames_it:
            # Only the folders of the path are compared with the split names:
            # a substring check over the whole path would also react on
            # a document whose own name contains "dev", "test" or "train".
            folders = filepath.replace("\\", "/").split("/")[:-1]
            for split_type, split_name in NerelIOUtils.splits.items():
                if split_name in folders:
                    yield filename, split_type

    @staticmethod
    def iter_collection_filenames(version=DEFAULT_VERSION):
        """ Enumerate the documents of the collection.

            yields: pairs of (index, (inner path without extension, basename)).
        """
        filenames_it = NerelIOUtils.__iter_filenames_from_dataset(version=version)
        for doc_id, filename in enumerate(filenames_it):
            yield doc_id, filename

    @staticmethod
    def map_doc_to_fold_type(version=DEFAULT_VERSION):
        """ returns: dict
                document name -> data type of the folder the document belongs to.
        """
        d2f = {}
        for filename, split_type in NerelIOUtils.__iter_filename_and_splittype(version):
            d2f[filename] = split_type
        return d2f

    @staticmethod
    def read_dataset_split(version=DEFAULT_VERSION):
        """ Compose the fixed folding from the folders of the archive.

            returns: pair of (filenames indexed by document ids, folding),
                as provided by `create_fixed_folding`.
        """
        # Every data type starts with an empty list, so a data type
        # without documents does not end up with KeyError below.
        f2d = {split_type: [] for split_type in NerelIOUtils.splits}
        for filename, split_type in NerelIOUtils.__iter_filename_and_splittype(version):
            f2d[split_type].append(filename)

        filenames_by_ids, data_folding = create_fixed_folding(train_filenames=f2d[DataType.Train],
                                                              test_filenames=f2d[DataType.Test],
                                                              dev_filenames=f2d[DataType.Dev])

        return filenames_by_ids, data_folding
41 changes: 41 additions & 0 deletions arekit/contrib/source/nerel/reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from arekit.contrib.source.brat.annot import BratAnnotationParser
from arekit.contrib.source.brat.news import BratNews
from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
from arekit.contrib.source.nerel.entities import NerelEntityCollection
from arekit.contrib.source.nerel.io_utils import NerelIOUtils, DEFAULT_VERSION


class NerelDocReader(object):
    """ Reader of the NEREL documents stored in the collection archive.
    """

    @staticmethod
    def read_text_relations(folding_type, filename, version):
        """ Read the list of relations declared in the `.ann` file of the document.

            folding_type: data type which selects the archive folder.
            filename: str, document name without extension.
            version: collection version, forwarded to `NerelIOUtils.read_from_zip`.
        """
        assert(isinstance(filename, str))

        def parse_relations(input_file):
            annotations = BratAnnotationParser.parse_annotations(
                input_file=input_file, encoding='utf-8-sig')
            return list(annotations["relations"])

        annotation_path = NerelIOUtils.get_annotation_innerpath(
            folding_data_type=folding_type, filename=filename)

        return NerelIOUtils.read_from_zip(
            inner_path=annotation_path,
            process_func=parse_relations,
            version=version)

    @staticmethod
    def read_document(filename, doc_id, doc_fold=None, version=DEFAULT_VERSION, entities_to_ignore=None):
        """ Read the document: its entities, text relations and sentences.

            filename: str, document name without extension.
            doc_id: int, identifier assigned to the resulting `BratNews`.
            doc_fold: mapping or None
                document name -> data type; when None, it is obtained
                from `NerelIOUtils.map_doc_to_fold_type`.
            version: collection version to read from.
            entities_to_ignore: list or None, forwarded to
                `NerelEntityCollection.read_collection`.
        """
        assert(isinstance(filename, str))
        assert(isinstance(doc_id, int))

        entities = NerelEntityCollection.read_collection(
            filename=filename, version=version, entities_to_ignore=entities_to_ignore)

        if doc_fold is None:
            doc_fold = NerelIOUtils.map_doc_to_fold_type(version)

        folding_type = doc_fold[filename]

        text_relations = NerelDocReader.read_text_relations(
            folding_type=folding_type, filename=filename, version=version)

        def build_document(input_file):
            # Sentences are read with respect to the entities collected above.
            sentences = BratDocumentSentencesReader.from_file(input_file=input_file, entities=entities)
            return BratNews(doc_id=doc_id, sentences=sentences, text_relations=text_relations)

        news_path = NerelIOUtils.get_news_innerpath(folding_data_type=folding_type, filename=filename)

        return NerelIOUtils.read_from_zip(
            inner_path=news_path,
            process_func=build_document,
            version=version)
33 changes: 33 additions & 0 deletions tests/contrib/source/test_nerel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import unittest

from tqdm import tqdm

from arekit.contrib.source.brat.news import BratNews
from arekit.contrib.source.brat.relation import BratRelation
from arekit.contrib.source.brat.sentence import BratSentence
from arekit.contrib.source.nerel.io_utils import NerelIOUtils
from arekit.contrib.source.nerel.reader import NerelDocReader


class TestNerelRead(unittest.TestCase):
    """ Reading checks for the NEREL collection, performed through `NerelDocReader`.
    """

    def test(self):
        """ Read a single document and print its sentences, entities and relations.
        """
        news = NerelDocReader.read_document(filename="109230_text", doc_id=0)
        self.assertIsInstance(news, BratNews)
        print("Sentences Count:", news.SentencesCount)
        for sentence in news.iter_sentences():
            self.assertIsInstance(sentence, BratSentence)
            print(sentence.Text.strip())
            for entity, bound in sentence.iter_entity_with_local_bounds():
                print("{}: ['{}',{}, {}]".format(
                    entity.ID, entity.Value, entity.Type,
                    "-".join([str(bound.Position), str(bound.Position+bound.Length)])))

        for brat_relation in news.Relations:
            self.assertIsInstance(brat_relation, BratRelation)
            print(brat_relation.SourceID, brat_relation.TargetID, brat_relation.Type)

    def test_all_documents(self):
        """ Read every document of the default collection version.
        """
        filenames_by_ids, folding = NerelIOUtils.read_dataset_split()

        # The mapping is the same for every document, so it is composed once
        # and passed to the reader instead of being rebuilt on every call.
        doc_fold = NerelIOUtils.map_doc_to_fold_type()

        # Materialized once: tqdm takes the total from the length of the list.
        doc_ids = list(folding.iter_doc_ids())

        for doc_id in tqdm(doc_ids):
            # Every document is read under its own id rather than the constant zero.
            NerelDocReader.read_document(filename=filenames_by_ids[doc_id], doc_id=doc_id, doc_fold=doc_fold)
2 changes: 1 addition & 1 deletion tests/contrib/source/test_sentinerel.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from arekit.contrib.source.sentinerel.reader import SentiNerelDocReader


class TestRead(unittest.TestCase):
class TestSentiNERELRead(unittest.TestCase):

def test(self):
news = SentiNerelDocReader.read_document(filename="2070_text", doc_id=0)
Expand Down

0 comments on commit 5369f6e

Please sign in to comment.