diff --git a/README.md b/README.md
index 541a5974..a45cc7f3 100644
--- a/README.md
+++ b/README.md
@@ -55,6 +55,8 @@ Please follows th
 ## Applications
 
+* **AREnets** [[github]](https://github.com/nicolay-r/AREnets)
+  * is an OpenNRE-like project, but with a kernel based on the TensorFlow library and implementations of neural networks on top of it, designed for Attitude Extraction
 * **ARElight** [[site]](https://nicolay-r.github.io/arelight-page/) [[github]](https://github.com/nicolay-r/ARElight)
   * **Infer attitudes** from large Mass-media documents or **sample texts** for your Machine Learning models applications
diff --git a/arekit/contrib/networks/input/const.py b/arekit/contrib/networks/input/const.py
index 888c338a..0f39a327 100644
--- a/arekit/contrib/networks/input/const.py
+++ b/arekit/contrib/networks/input/const.py
@@ -4,5 +4,6 @@
 SynonymObject = "syn_objs"
 SynonymSubject = "syn_subjs"
 PosTags = "pos_tags"
+Text = "text"
 
 ArgsSep = ','
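The new `Text` constant renames the sample's text column from the BERT-style `"text_a"` to plain `"text"` (picked up by the `rows_parser.py` changes below). A minimal sketch of how a serialized sample row might look under the new name; the `"id"` and `"doc_id"` fields and all values here are illustrative assumptions, not part of the change:

```python
import json

# Hypothetical .jsonl sample row: only the "text" key is what this
# change standardizes; the remaining fields are illustrative.
line = '{"id": "train_0", "doc_id": 12, "text": "USA imposed sanctions against Russia"}'
row = json.loads(line)
assert row["text"] == "USA imposed sanctions against Russia"
```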
""" def __init__(self, row): - assert(isinstance(row, pd.Series)) + """ row: dict + dict of the pairs ("field_name", value) + """ + assert(isinstance(row, dict)) self.__uint_label = None self.__params = {} @@ -64,13 +76,16 @@ def __init__(self, row): self.__params[key] = parse_value[key](value) + def __value_or_none(self, key): + return self.__params[key] if key in self.__params else no_value() + @property def SampleID(self): return self.__params[const.ID] @property def Terms(self): - return self.__params["text_a"] + return self.__params[network_input_const.Text] @property def SubjectIndex(self): @@ -86,33 +101,33 @@ def UintLabel(self): @property def PartOfSpeechTags(self): - return self.__params[network_input_const.PosTags] + return self.__value_or_none(network_input_const.PosTags) @property def TextFrameVariantIndices(self): - return self.__params[network_input_const.FrameVariantIndices] + return self.__value_or_none(network_input_const.FrameVariantIndices) @property def TextFrameConnotations(self): - return self.__params[network_input_const.FrameConnotations] + return self.__value_or_none(network_input_const.FrameConnotations) @property def EntityInds(self): - return self.__params[const.ENTITIES] + return self.__value_or_none(const.ENTITIES) @property def SynonymObjectInds(self): - return self.__params[network_input_const.SynonymObject] + return self.__value_or_none(network_input_const.SynonymObject) @property def SynonymSubjectInds(self): - return self.__params[network_input_const.SynonymSubject] + return self.__value_or_none(network_input_const.SynonymSubject) def __getitem__(self, item): assert (isinstance(item, str) or item is None) if item not in self.__params: - return None - return self.__params[item] if item is not None else None + return no_value() + return self.__params[item] if item is not None else no_value() @classmethod def parse(cls, row): diff --git a/arekit/contrib/source/brat/annot.py b/arekit/contrib/source/brat/annot.py index 959ff2f0..2749e7a0 100644 --- a/arekit/contrib/source/brat/annot.py +++ b/arekit/contrib/source/brat/annot.py @@ -14,33 +14,40 @@ def __non_prefixed_id(value): @staticmethod def handle_entity(args): + """ T2 Location 10 23 South America + T1 Location 0 5;16 23 North America + """ + assert(len(args) == 3) - if len(args) < 4: - return None + e_id = int(BratAnnotationParser.__non_prefixed_id(args[0])) + entity_params = args[1].split() - if not str.isdigit(args[2]) or not str.isdigit(args[3]): + if len(entity_params) > 3: + # We do not support the case of a non-continuous entity mentions. return None - e_id = int(BratAnnotationParser.__non_prefixed_id(args[0])) - e_str_type = args[1] - e_begin = int(args[2]) - e_end = int(args[3]) - e_value = " ".join([arg.strip().replace(',', '') for arg in args[4:]]) + e_str_type, e_begin, e_end = entity_params return BratEntity(id_in_doc=e_id, e_type=e_str_type, - index_begin=e_begin, - index_end=e_end, - value=e_value) + index_begin=int(e_begin), + index_end=int(e_end), + value=args[2].strip()) @staticmethod def handle_relation(args): + """ Example: + R1 Origin Arg1:T3 Arg2:T4 + """ + # Parse identifier index. e_id = args[0][1:] - rel_type = args[1] - source_id = args[2].split(':')[1] - target_id = args[3].split(':')[1] + # Parse relation arguments. 
diff --git a/arekit/contrib/source/brat/annot.py b/arekit/contrib/source/brat/annot.py
index 959ff2f0..2749e7a0 100644
--- a/arekit/contrib/source/brat/annot.py
+++ b/arekit/contrib/source/brat/annot.py
@@ -14,33 +14,40 @@ def __non_prefixed_id(value):
 
     @staticmethod
     def handle_entity(args):
+        """ T2  Location 10 23  South America
+            T1  Location 0 5;16 23  North America
+        """
+        assert(len(args) == 3)
 
-        if len(args) < 4:
-            return None
+        e_id = int(BratAnnotationParser.__non_prefixed_id(args[0]))
+        entity_params = args[1].split()
 
-        if not str.isdigit(args[2]) or not str.isdigit(args[3]):
+        if len(entity_params) > 3:
+            # Non-continuous entity mentions are not supported.
             return None
 
-        e_id = int(BratAnnotationParser.__non_prefixed_id(args[0]))
-        e_str_type = args[1]
-        e_begin = int(args[2])
-        e_end = int(args[3])
-        e_value = " ".join([arg.strip().replace(',', '') for arg in args[4:]])
+        e_str_type, e_begin, e_end = entity_params
 
         return BratEntity(id_in_doc=e_id,
                           e_type=e_str_type,
-                          index_begin=e_begin,
-                          index_end=e_end,
-                          value=e_value)
+                          index_begin=int(e_begin),
+                          index_end=int(e_end),
+                          value=args[2].strip())
 
     @staticmethod
     def handle_relation(args):
+        """ Example:
+            R1  Origin Arg1:T3 Arg2:T4
+        """
 
+        # Parse identifier index.
         e_id = args[0][1:]
 
-        rel_type = args[1]
-        source_id = args[2].split(':')[1]
-        target_id = args[3].split(':')[1]
+        # Parse relation type and arguments.
+        rel_type, source, target = args[1].split()
+
+        source_id = source.split(':')[1]
+        target_id = target.split(':')[1]
 
         return BratRelation(id_in_doc=e_id,
                             source_id=int(BratAnnotationParser.__non_prefixed_id(source_id)),
@@ -57,7 +64,7 @@ def parse_annotations(input_file, encoding='utf-8'):
         for line in input_file.readlines():
             line = line.decode(encoding)
 
-            args = line.split()
+            args = line.split('\t')
 
             record_type = args[0][0]
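Since annotation lines are now split on tabs rather than on arbitrary whitespace (see `parse_annotations` above), multi-word entity values survive intact, and the `<type> <begin> <end>` span specification is split on spaces separately. A short walk-through of the new entity parsing on the docstring example, using only logic visible in this patch:

```python
# A BRAT entity line has three tab-separated fields:
# identifier, "<type> <begin> <end>" span spec, and the entity value.
line = "T2\tLocation 10 23\tSouth America"
args = line.split('\t')        # ['T2', 'Location 10 23', 'South America']

e_id = int(args[0][1:])        # 2 -- identifier with the 'T' prefix dropped
e_str_type, e_begin, e_end = args[1].split()

assert (e_str_type, int(e_begin), int(e_end)) == ("Location", 10, 23)
assert args[2].strip() == "South America"   # value keeps its inner space
```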
diff --git a/arekit/contrib/utils/data/readers/jsonl.py b/arekit/contrib/utils/data/readers/jsonl.py
new file mode 100644
index 00000000..ca9c4923
--- /dev/null
+++ b/arekit/contrib/utils/data/readers/jsonl.py
@@ -0,0 +1,15 @@
+from arekit.contrib.utils.data.readers.base import BaseReader
+from arekit.contrib.utils.data.storages.jsonl_based import JsonlBasedRowsStorage
+
+
+class JsonlReader(BaseReader):
+
+    def read(self, target):
+        rows = []
+        with open(target, "r") as f:
+            for line in f.readlines():
+                rows.append(line)
+        return JsonlBasedRowsStorage(rows)
+
+    def target_extension(self):
+        return ".jsonl"
diff --git a/arekit/contrib/utils/data/storages/jsonl_based.py b/arekit/contrib/utils/data/storages/jsonl_based.py
new file mode 100644
index 00000000..bc32e269
--- /dev/null
+++ b/arekit/contrib/utils/data/storages/jsonl_based.py
@@ -0,0 +1,18 @@
+import json
+
+from arekit.common.data.storages.base import BaseRowsStorage
+
+
+class JsonlBasedRowsStorage(BaseRowsStorage):
+
+    def __init__(self, rows):
+        assert(isinstance(rows, list))
+        self.__rows = rows
+
+    def _iter_rows(self):
+        for row_index, row in enumerate(self.__rows):
+            assert(isinstance(row, str))
+            yield row_index, json.loads(row)
+
+    def _get_rows_count(self):
+        return len(self.__rows)
diff --git a/arekit/contrib/utils/evaluation/analyze_errors.py b/arekit/contrib/utils/evaluation/analyze_errors.py
index 13fba1e6..5d6f96ab 100644
--- a/arekit/contrib/utils/evaluation/analyze_errors.py
+++ b/arekit/contrib/utils/evaluation/analyze_errors.py
@@ -131,13 +131,12 @@ def extract_errors(eval_result, test_samples_filepath, etalon_samples_filepath,
         for sample_col in columns_to_copy:
             eval_errors_df.at[row_id, sample_col] = sample_row[sample_col]
 
-        text_terms =__post_text_processing(sample_row=sample_row, source_ind=source_ind, target_ind=target_ind)
+        text_terms = __post_text_processing(sample_row=sample_row, source_ind=source_ind, target_ind=target_ind)
         cropped_text = __crop_text_terms(source_ind=source_ind, target_ind=target_ind, text_terms=text_terms)
 
         eval_errors_df.at[row_id, BaseSingleTextProvider.TEXT_A] = cropped_text
 
-        # Replace with the values instead of indices.
-        entity_inds = __get_entity_inds(sample_row)
+        # Replace source and target indices with the corresponding values.
         eval_errors_df.at[row_id, const.S_IND] = text_terms[source_ind]
         eval_errors_df.at[row_id, const.T_IND] = text_terms[target_ind]
diff --git a/arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py b/arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py
index c2002254..96101feb 100644
--- a/arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py
+++ b/arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py
@@ -44,7 +44,6 @@ def create_text_opinion_extraction_pipeline(text_parser,
         version=version,
         doc_id_func=lambda doc_id: doc_id,
         keep_doc_ids_only=False,
-        label_scaler=label_scaler,
         limit=limit)
 
     doc_ops = DictionaryBasedDocumentOperations(ru_attitudes)
diff --git a/arekit/contrib/utils/pipelines/sources/ruattitudes/utils.py b/arekit/contrib/utils/pipelines/sources/ruattitudes/utils.py
index 65f3271f..1420856d 100644
--- a/arekit/contrib/utils/pipelines/sources/ruattitudes/utils.py
+++ b/arekit/contrib/utils/pipelines/sources/ruattitudes/utils.py
@@ -18,7 +18,7 @@ def get_doc(self, doc_id):
         return self.__ru_attitudes[doc_id]
 
 
-def read_ruattitudes_to_brat_in_memory(version, keep_doc_ids_only, doc_id_func, label_scaler, limit=None):
+def read_ruattitudes_to_brat_in_memory(version, keep_doc_ids_only, doc_id_func, limit=None):
     """ Performs reading of RuAttitude formatted documents and
         selection according to 'doc_ids_set' parameter.
     """
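Returning to the two new JSONL modules above: the reader and storage are designed to compose, with `JsonlReader.read` collecting raw lines and deferring JSON decoding to the storage's row iterator. A minimal usage sketch; the file name is a hypothetical placeholder:

```python
from arekit.contrib.utils.data.readers.jsonl import JsonlReader
from arekit.contrib.utils.data.storages.jsonl_based import JsonlBasedRowsStorage

reader = JsonlReader()

# Assuming "samples.jsonl" contains one JSON object per line.
storage = reader.read("samples.jsonl")
assert isinstance(storage, JsonlBasedRowsStorage)
```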