Skip to content

Commit

Permalink
#507 Make NerelDocReader non-static for faster reading (caches the document-to-fold mapping)
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Jul 29, 2023
1 parent 5369f6e commit 4cb1fba
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 15 deletions.
28 changes: 15 additions & 13 deletions arekit/contrib/source/nerel/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,23 @@

class NerelDocReader(object):

def __init__(self, version=DEFAULT_VERSION):
    """Create a reader bound to one version of the collection.

    The document-to-fold mapping is obtained once here via
    ``NerelIOUtils.map_doc_to_fold_type`` and kept on the instance, so the
    read methods can look a document's fold up by filename instead of
    recomputing the mapping on every call.

    :param version: collection version passed to every ``NerelIOUtils`` call.
    """
    self.__version = version
    # Indexed by filename in the read methods to get the folding data type.
    self.__doc_fold = NerelIOUtils.map_doc_to_fold_type(version)

def read_text_relations(self, filename):
    """Read the relation annotations of a single document.

    The annotation file is located inside the archive through
    ``NerelIOUtils.get_annotation_innerpath`` and parsed with
    ``BratAnnotationParser.parse_annotations`` using ``utf-8-sig`` encoding.

    :param filename: document name (``str``); must be a key of the
        document-to-fold mapping, otherwise the lookup raises ``KeyError``.
    :return: whatever ``NerelIOUtils.read_from_zip`` returns -- presumably
        the list of parsed ``"relations"`` entries built by ``process_func``
        (confirm against ``read_from_zip``).
    """
    assert isinstance(filename, str)

    def parse_relations(input_file):
        # Only the "relations" section of the parsed annotations is needed.
        annotations = BratAnnotationParser.parse_annotations(
            input_file=input_file, encoding='utf-8-sig')
        return list(annotations["relations"])

    inner_path = NerelIOUtils.get_annotation_innerpath(
        folding_data_type=self.__doc_fold[filename],
        filename=filename)

    return NerelIOUtils.read_from_zip(
        inner_path=inner_path,
        process_func=parse_relations,
        version=self.__version)

@staticmethod
def read_document(filename, doc_id, doc_fold=None, version=DEFAULT_VERSION, entities_to_ignore=None):
def read_document(self, filename, doc_id, entities_to_ignore=None):
assert(isinstance(filename, str))
assert(isinstance(doc_id, int))

Expand All @@ -28,14 +32,12 @@ def file_to_doc(input_file):
return BratNews(doc_id=doc_id, sentences=sentences, text_relations=text_relations)

entities = NerelEntityCollection.read_collection(
filename=filename, version=version, entities_to_ignore=entities_to_ignore)

doc_fold = NerelIOUtils.map_doc_to_fold_type(version) if doc_fold is None else doc_fold
filename=filename, version=self.__version, entities_to_ignore=entities_to_ignore)

text_relations = NerelDocReader.read_text_relations(
folding_type=doc_fold[filename], filename=filename, version=version)
text_relations = self.read_text_relations(filename=filename)

return NerelIOUtils.read_from_zip(
inner_path=NerelIOUtils.get_news_innerpath(folding_data_type=doc_fold[filename], filename=filename),
inner_path=NerelIOUtils.get_news_innerpath(
folding_data_type=self.__doc_fold[filename], filename=filename),
process_func=file_to_doc,
version=version)
version=self.__version)
6 changes: 4 additions & 2 deletions tests/contrib/source/test_nerel.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
class TestNerelRead(unittest.TestCase):

def test(self):
news = NerelDocReader.read_document(filename="109230_text", doc_id=0)
doc_reader = NerelDocReader()
news = doc_reader.read_document(filename="109230_text", doc_id=0)
assert(isinstance(news, BratNews))
print("Sentences Count:", news.SentencesCount)
for sentence in news.iter_sentences():
Expand All @@ -28,6 +29,7 @@ def test(self):
print(brat_relation.SourceID, brat_relation.TargetID, brat_relation.Type)

def test_all_documents(self):
    """Read every document listed in the dataset split with one shared reader."""
    # A single reader instance is reused for all documents.
    doc_reader = NerelDocReader()
    filenames_by_ids, folding = NerelIOUtils.read_dataset_split()
    # Materialize the ids once: they are needed both for the progress-bar
    # total and for the iteration itself.
    doc_ids = list(folding.iter_doc_ids())
    for doc_id in tqdm(doc_ids, total=len(doc_ids)):
        doc_reader.read_document(filename=filenames_by_ids[doc_id], doc_id=0)

0 comments on commit 4cb1fba

Please sign in to comment.