-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
- Loading branch information
Showing
13 changed files
with
337 additions
and
63 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from os.path import basename | ||
|
||
|
||
def __iter_filtered_filenames(filenames_iter):
    """ Keep only the ".txt" entries of the given paths.

        filenames_iter: iterable of str
            paths to be filtered.

        Yields a pair for every path that ends with ".txt":
        (path without the extension, basename of that cropped path).
        All the other paths are skipped.
    """
    txt_extension = ".txt"
    for path_with_ext in filenames_iter:
        if not path_with_ext.endswith(txt_extension):
            continue
        # Crop extension.
        cropped_path = path_with_ext[:-len(txt_extension)]
        yield cropped_path, basename(cropped_path)
|
||
|
||
def iter_filename_and_splittype(filenames_it, splits):
    """ Pair every ".txt" document of the collection with its split type.

        filenames_it: iterable of str
            paths of the collection entries; entries without the ".txt"
            extension are dropped by the filtering helper.
        splits: iterable of (split_type, split_name) pairs
            it is traversed once per document, so it has to be re-iterable
            (for example ``dict.items()``) rather than a one-shot generator.

        Yields (filename, split_type) pairs, where filename is the basename
        of the document path without the extension. A pair is yielded for
        every split whose name occurs in the extension-less path, so a document
        matching no split is skipped.
    """
    # The running document index is not needed here, so there is no enumerate().
    for filepath, filename in __iter_filtered_filenames(filenames_it):
        for split_type, split_name in splits:
            # NOTE(review): this is a plain substring check over the whole
            # extension-less path (document name included), so a single
            # document may be yielded for several splits -- confirm that
            # split names never occur inside other path parts.
            if split_name in filepath:
                yield filename, split_type
|
||
|
||
def iter_collection_filenames(filenames_it):
    """ Enumerate the ".txt" documents of the collection.

        filenames_it: iterable of str
            paths of the collection entries.

        Yields (doc_id, entry) pairs: doc_id is a zero-based index that
        counts only the entries kept by the filtering helper, and entry is
        the pair produced by that helper, i.e.
        (path without the extension, basename of that path).
    """
    filtered_entries = __iter_filtered_filenames(filenames_it)
    yield from enumerate(filtered_entries)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import enum | ||
|
||
|
||
class NerelVersions(enum.Enum):
    """ Versions of this collection that are supported.
    """

    # The value of every member is the string tag of the related version.
    V1 = "v1_0"
    V11 = "v1_1"


# The version that is chosen as the default one.
DEFAULT_VERSION = NerelVersions.V1
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
from os import path | ||
|
||
from arekit.common.experiment.data_type import DataType | ||
from arekit.contrib.source.nerel.folding.fixed import create_fixed_folding | ||
from arekit.contrib.source.nerel.io_utils import NerelIOUtils | ||
from arekit.contrib.source.nerel.utils import iter_filename_and_splittype | ||
|
||
|
||
class NerelBioIOUtils(NerelIOUtils):
    """ Paths and data-split information for the "nerel-bio" archive.

        Everything is exposed through static methods; the archive location
        and the archive listing (`get_data_root`, `iter_filenames_from_zip`)
        are not defined in this class and are expected to come from the
        base class.
    """

    # Name of every data split; it is used both as the leading part of the
    # inner paths and as the marker for assigning documents to splits.
    splits = {
        DataType.Train: "train",
        DataType.Dev: "dev",
        DataType.Test: "test"
    }

    @staticmethod
    def get_archive_filepath(version):
        """ Path of the "nerel-bio-<version>.zip" file inside the data root.
        """
        return path.join(NerelBioIOUtils.get_data_root(), "nerel-bio-{}.zip".format(version))

    @staticmethod
    def get_annotation_innerpath(folding_data_type, filename):
        """ Inner path of the ".ann" file of the document.

            folding_data_type: key of `splits`
                selects the split the document belongs to.
            filename: str
                document name without extension.
        """
        assert(isinstance(filename, str))
        # NOTE(review): os.path.join relies on the OS-specific separator;
        # confirm this is what the archive reader expects on every platform.
        return path.join(NerelBioIOUtils.splits[folding_data_type], "{}.ann".format(filename))

    @staticmethod
    def get_news_innerpath(folding_data_type, filename):
        """ Inner path of the ".txt" file of the document.

            folding_data_type: key of `splits`
                selects the split the document belongs to.
            filename: str
                document name without extension.
        """
        assert(isinstance(filename, str))
        # NOTE(review): os.path.join relies on the OS-specific separator;
        # confirm this is what the archive reader expects on every platform.
        return path.join(NerelBioIOUtils.splits[folding_data_type], "{}.txt".format(filename))

    @staticmethod
    def _iter_docs_with_splits(version):
        """ Iterate (filename, split_type) pairs for the archive of the given version.
        """
        return iter_filename_and_splittype(
            filenames_it=NerelBioIOUtils.iter_filenames_from_zip(version),
            splits=NerelBioIOUtils.splits.items())

    @staticmethod
    def map_doc_to_fold_type(version):
        """ Build the {filename: split_type} dictionary for the given version.

            When the same filename is reported more than once,
            the last reported split type is kept.
        """
        return dict(NerelBioIOUtils._iter_docs_with_splits(version))

    @staticmethod
    def read_dataset_split(version, docs_limit=None):
        """ Group the archive documents by split and build the fixed folding.

            version:
                version of the archive to be read.
            docs_limit:
                passed as `limit` to `create_fixed_folding`.

            Returns the (filenames_by_ids, data_folding) pair produced
            by `create_fixed_folding`.
        """
        # Every known split gets a list in advance, so a split without
        # documents results in an empty list rather than in a KeyError.
        f2d = {split_type: [] for split_type in NerelBioIOUtils.splits}
        for filename, split_type in NerelBioIOUtils._iter_docs_with_splits(version):
            f2d[split_type].append(filename)

        filenames_by_ids, data_folding = create_fixed_folding(train_filenames=f2d[DataType.Train],
                                                              test_filenames=f2d[DataType.Test],
                                                              dev_filenames=f2d[DataType.Dev],
                                                              limit=docs_limit)

        return filenames_by_ids, data_folding
Oops, something went wrong.