From 66980b287eb96fc361140bc81180893e0850bcf2 Mon Sep 17 00:00:00 2001 From: Nicolay Rusnachenko Date: Sat, 7 Jan 2023 10:49:15 +0000 Subject: [PATCH 1/6] Provide AREnets link --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 541a5974..a45cc7f3 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,8 @@ Please follows th ## Applications +* **AREnets** [[github]](https://github.com/nicolay-r/AREnets) + * is an OpenNRE-like project, but with a kernel based on the TensorFlow library and neural network implementations on top of it, designed for Attitude * **ARElight** [[site]](https://nicolay-r.github.io/arelight-page/) [[github]](https://github.com/nicolay-r/ARElight) * **Infer attitudes** from large Mass-media documents or **sample texts** for your Machine Learning models applications From 35fd8ae083d01887eaf4cc9dcb71de3170fb632b Mon Sep 17 00:00:00 2001 From: Nicolay Rusnachenko Date: Thu, 12 Jan 2023 12:20:23 +0000 Subject: [PATCH 2/6] #429 related sync --- arekit/contrib/utils/data/readers/jsonl.py | 15 +++++++++++++++ .../contrib/utils/data/storages/jsonl_based.py | 18 ++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 arekit/contrib/utils/data/readers/jsonl.py create mode 100644 arekit/contrib/utils/data/storages/jsonl_based.py diff --git a/arekit/contrib/utils/data/readers/jsonl.py b/arekit/contrib/utils/data/readers/jsonl.py new file mode 100644 index 00000000..ca9c4923 --- /dev/null +++ b/arekit/contrib/utils/data/readers/jsonl.py @@ -0,0 +1,15 @@ +from arekit.contrib.utils.data.readers.base import BaseReader +from arekit.contrib.utils.data.storages.jsonl_based import JsonlBasedRowsStorage + + +class JsonlReader(BaseReader): + + def read(self, target): + rows = [] + with open(target, "r") as f: + for line in f.readlines(): + rows.append(line) + return JsonlBasedRowsStorage(rows) + + def target_extension(self): + return ".jsonl" diff --git a/arekit/contrib/utils/data/storages/jsonl_based.py 
b/arekit/contrib/utils/data/storages/jsonl_based.py new file mode 100644 index 00000000..bc32e269 --- /dev/null +++ b/arekit/contrib/utils/data/storages/jsonl_based.py @@ -0,0 +1,18 @@ +import json + +from arekit.common.data.storages.base import BaseRowsStorage + + +class JsonlBasedRowsStorage(BaseRowsStorage): + + def __init__(self, rows): + assert(isinstance(rows, list)) + self.__rows = rows + + def _iter_rows(self): + for row_index, row in enumerate(self.__rows): + assert(isinstance(row, str)) + yield row_index, json.loads(row) + + def _get_rows_count(self): + return len(self.__rows) From ea26d166716cfae0c272de3a1d152394f41eca8a Mon Sep 17 00:00:00 2001 From: Nicolay Rusnachenko Date: Thu, 12 Jan 2023 12:21:56 +0000 Subject: [PATCH 3/6] #429 sync. Fix #427 --- arekit/contrib/networks/input/const.py | 1 + arekit/contrib/networks/input/rows_parser.py | 51 +++++++++++++------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/arekit/contrib/networks/input/const.py b/arekit/contrib/networks/input/const.py index 888c338a..0f39a327 100644 --- a/arekit/contrib/networks/input/const.py +++ b/arekit/contrib/networks/input/const.py @@ -4,5 +4,6 @@ SynonymObject = "syn_objs" SynonymSubject = "syn_subjs" PosTags = "pos_tags" +Text = "text" ArgsSep = ',' diff --git a/arekit/contrib/networks/input/rows_parser.py b/arekit/contrib/networks/input/rows_parser.py index 9f8a9ba8..10dad525 100644 --- a/arekit/contrib/networks/input/rows_parser.py +++ b/arekit/contrib/networks/input/rows_parser.py @@ -1,24 +1,35 @@ -import pandas as pd - from arekit.common.data import const from arekit.common.utils import filter_whitespaces, split_by_whitespaces -from . 
import const as network_input_const + +import arekit.contrib.networks.input.const as network_input_const empty_list = [] +def no_value(): + return None + + def __process_values_list(value): return value.split(network_input_const.ArgsSep) def __process_indices_list(value): - return [int(v) for v in str(value).split(network_input_const.ArgsSep)] + return no_value() if not value else [int(v) for v in str(value).split(network_input_const.ArgsSep)] def __process_int_values_list(value): return __process_indices_list(value) +def __handle_text(value): + """ The core method of the input text processing. + """ + assert(isinstance(value, str) or isinstance(value, list)) + return filter_whitespaces([term for term in split_by_whitespaces(value)] + if isinstance(value, str) else value) + + parse_value = { const.ID: lambda value: value, const.DOC_ID: lambda value: int(value), @@ -35,18 +46,19 @@ def __process_int_values_list(value): network_input_const.SynonymObject: lambda value: __process_indices_list(value), network_input_const.SynonymSubject: lambda value: __process_indices_list(value), network_input_const.PosTags: lambda value: __process_int_values_list(value), - "text_a": lambda value: filter_whitespaces([term for term in split_by_whitespaces(value)]) + network_input_const.Text: lambda value: __handle_text(value) } class ParsedSampleRow(object): - """ - Provides a parsed information for a sample row. - TODO. Use this class as API + """ Provides a parsed information for a sample row. 
""" def __init__(self, row): - assert(isinstance(row, pd.Series)) + """ row: dict + dict of the pairs ("field_name", value) + """ + assert(isinstance(row, dict)) self.__uint_label = None self.__params = {} @@ -64,13 +76,16 @@ def __init__(self, row): self.__params[key] = parse_value[key](value) + def __value_or_none(self, key): + return self.__params[key] if key in self.__params else no_value() + @property def SampleID(self): return self.__params[const.ID] @property def Terms(self): - return self.__params["text_a"] + return self.__params[network_input_const.Text] @property def SubjectIndex(self): @@ -86,33 +101,33 @@ def UintLabel(self): @property def PartOfSpeechTags(self): - return self.__params[network_input_const.PosTags] + return self.__value_or_none(network_input_const.PosTags) @property def TextFrameVariantIndices(self): - return self.__params[network_input_const.FrameVariantIndices] + return self.__value_or_none(network_input_const.FrameVariantIndices) @property def TextFrameConnotations(self): - return self.__params[network_input_const.FrameConnotations] + return self.__value_or_none(network_input_const.FrameConnotations) @property def EntityInds(self): - return self.__params[const.ENTITIES] + return self.__value_or_none(const.ENTITIES) @property def SynonymObjectInds(self): - return self.__params[network_input_const.SynonymObject] + return self.__value_or_none(network_input_const.SynonymObject) @property def SynonymSubjectInds(self): - return self.__params[network_input_const.SynonymSubject] + return self.__value_or_none(network_input_const.SynonymSubject) def __getitem__(self, item): assert (isinstance(item, str) or item is None) if item not in self.__params: - return None - return self.__params[item] if item is not None else None + return no_value() + return self.__params[item] if item is not None else no_value() @classmethod def parse(cls, row): From fbcb15fb3fec5d973a02db5edb671efd735585eb Mon Sep 17 00:00:00 2001 From: Nicolay Rusnachenko Date: Thu, 
12 Jan 2023 12:27:06 +0000 Subject: [PATCH 4/6] refactoring --- arekit/contrib/utils/evaluation/analyze_errors.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arekit/contrib/utils/evaluation/analyze_errors.py b/arekit/contrib/utils/evaluation/analyze_errors.py index 13fba1e6..5d6f96ab 100644 --- a/arekit/contrib/utils/evaluation/analyze_errors.py +++ b/arekit/contrib/utils/evaluation/analyze_errors.py @@ -131,13 +131,12 @@ def extract_errors(eval_result, test_samples_filepath, etalon_samples_filepath, for sample_col in columns_to_copy: eval_errors_df.at[row_id, sample_col] = sample_row[sample_col] - text_terms =__post_text_processing(sample_row=sample_row, source_ind=source_ind, target_ind=target_ind) + text_terms = __post_text_processing(sample_row=sample_row, source_ind=source_ind, target_ind=target_ind) cropped_text = __crop_text_terms(source_ind=source_ind, target_ind=target_ind, text_terms=text_terms) eval_errors_df.at[row_id, BaseSingleTextProvider.TEXT_A] = cropped_text - # Replace with the values instead of indices. - entity_inds = __get_entity_inds(sample_row) + # Replace the source and target indices with the corresponding text terms. 
eval_errors_df.at[row_id, const.S_IND] = text_terms[source_ind] eval_errors_df.at[row_id, const.T_IND] = text_terms[target_ind] From 730d535213507dfdeffa1e3d7b903a4d1d778fd0 Mon Sep 17 00:00:00 2001 From: Nicolay Rusnachenko Date: Thu, 12 Jan 2023 12:28:59 +0000 Subject: [PATCH 5/6] #436 fixed --- .../pipelines/sources/ruattitudes/extract_text_opinions.py | 1 - arekit/contrib/utils/pipelines/sources/ruattitudes/utils.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py b/arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py index c2002254..96101feb 100644 --- a/arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +++ b/arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py @@ -44,7 +44,6 @@ def create_text_opinion_extraction_pipeline(text_parser, version=version, doc_id_func=lambda doc_id: doc_id, keep_doc_ids_only=False, - label_scaler=label_scaler, limit=limit) doc_ops = DictionaryBasedDocumentOperations(ru_attitudes) diff --git a/arekit/contrib/utils/pipelines/sources/ruattitudes/utils.py b/arekit/contrib/utils/pipelines/sources/ruattitudes/utils.py index 65f3271f..1420856d 100644 --- a/arekit/contrib/utils/pipelines/sources/ruattitudes/utils.py +++ b/arekit/contrib/utils/pipelines/sources/ruattitudes/utils.py @@ -18,7 +18,7 @@ def get_doc(self, doc_id): return self.__ru_attitudes[doc_id] -def read_ruattitudes_to_brat_in_memory(version, keep_doc_ids_only, doc_id_func, label_scaler, limit=None): +def read_ruattitudes_to_brat_in_memory(version, keep_doc_ids_only, doc_id_func, limit=None): """ Performs reading of RuAttitude formatted documents and selection according to 'doc_ids_set' parameter. """ From 395671faf1c788fdf7e2faac41bcf58a252d890b Mon Sep 17 00:00:00 2001 From: Nicolay Rusnachenko Date: Mon, 16 Jan 2023 13:06:40 +0000 Subject: [PATCH 6/6] #437 -- refactored. Improved rows parser. 
#415 -- removed case of mentioned `,` in value. --- arekit/contrib/source/brat/annot.py | 37 +++++++++++++++++------------ 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/arekit/contrib/source/brat/annot.py b/arekit/contrib/source/brat/annot.py index 959ff2f0..2749e7a0 100644 --- a/arekit/contrib/source/brat/annot.py +++ b/arekit/contrib/source/brat/annot.py @@ -14,33 +14,40 @@ def __non_prefixed_id(value): @staticmethod def handle_entity(args): + """ T2 Location 10 23 South America + T1 Location 0 5;16 23 North America + """ + assert(len(args) == 3) - if len(args) < 4: - return None + e_id = int(BratAnnotationParser.__non_prefixed_id(args[0])) + entity_params = args[1].split() - if not str.isdigit(args[2]) or not str.isdigit(args[3]): + if len(entity_params) > 3: + # We do not support the case of non-continuous entity mentions. return None - e_id = int(BratAnnotationParser.__non_prefixed_id(args[0])) - e_str_type = args[1] - e_begin = int(args[2]) - e_end = int(args[3]) - e_value = " ".join([arg.strip().replace(',', '') for arg in args[4:]]) + e_str_type, e_begin, e_end = entity_params return BratEntity(id_in_doc=e_id, e_type=e_str_type, - index_begin=e_begin, - index_end=e_end, - value=e_value) + index_begin=int(e_begin), + index_end=int(e_end), + value=args[2].strip()) @staticmethod def handle_relation(args): + """ Example: + R1 Origin Arg1:T3 Arg2:T4 + """ + # Parse identifier index. e_id = args[0][1:] - rel_type = args[1] - source_id = args[2].split(':')[1] - target_id = args[3].split(':')[1] + # Parse relation arguments. + rel_type, source, target = args[1].split() + + source_id = source.split(':')[1] + target_id = target.split(':')[1] return BratRelation(id_in_doc=e_id, source_id=int(BratAnnotationParser.__non_prefixed_id(source_id)), @@ -57,7 +64,7 @@ def parse_annotations(input_file, encoding='utf-8'): for line in input_file.readlines(): line = line.decode(encoding) - args = line.split() + args = line.split('\t') record_type = args[0][0]