From e67a355aece0fe43e6c5dccaaa3926041bb1b536 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stephan=20G=C3=BCnther?= Date: Wed, 19 Oct 2022 02:24:03 +0200 Subject: [PATCH 1/5] Add a failing test for issue #985 In order to reproduce the issue, the test creates two `Dataset`s ending in parallel tasks and sets their `__module__` attribute to a module below the `egon.data.datasets` package to simulate having the `Dataset`s created there. Then the test checks that the ids of the final `Dataset` tasks are distinct. --- tests/test_dataset_class.py | 45 +++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 tests/test_dataset_class.py diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py new file mode 100644 index 000000000..a6590f367 --- /dev/null +++ b/tests/test_dataset_class.py @@ -0,0 +1,45 @@ +from dataclasses import dataclass +from typing import Union + +from airflow.models.dag import DAG + +from egon.data.datasets import Dataset, TaskGraph, Tasks + + +def test_uniqueness_of_automatically_generated_final_dataset_task(): + """Test that the generated final dataset task is named uniquely. + + This is a regression test for issue #985. Having multiple `Dataset`s ending + in parallel tasks doesn't work if those `Dataset`s are in a module below + the `egon.data.datasets` package. In that case the code removing the module + name prefix from task ids and the code generating the final dataset task + which updates the dataset version once all parallel tasks have finished + interact in a way that generates non-distinct task ids so that tasks + generated later clobber the ones generated earlier. This leads to spurious + cycles and other inconsistencies and bugs in the graph. + """ + + noops = [(lambda: None) for _ in range(4)] + for i, noop in enumerate(noops): + noop.__name__ = f"noop-{i}" + + @dataclass + class Dataset_1(Dataset): + name: str = "DS1" + version: str = "0.0.0" + tasks: Union[Tasks, TaskGraph] = ({noops[0], noops[1]},) + + @dataclass + class Dataset_2(Dataset): + name: str = "DS2" + version: str = "0.0.0" + tasks: Union[Tasks, TaskGraph] = ({noops[2], noops[3]},) + + Dataset_1.__module__ = "egon.data.datasets.test.datasets" + Dataset_2.__module__ = "egon.data.datasets.test.datasets" + with DAG(dag_id="Test-DAG", default_args={"start_date": "1111-11-11"}): + datasets = [Dataset_1(), Dataset_2()] + ids = [list(dataset.tasks)[-1] for dataset in datasets] + assert ( + ids[0] != ids[1] + ), "Expected unique names for final tasks of distinct datasets." From e438a0862aea50d26337cc87d703723d548da5c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stephan=20G=C3=BCnther?= Date: Wed, 19 Oct 2022 02:35:30 +0200 Subject: [PATCH 2/5] Change `--strict` to `--strict-markers` for pytest Running the tests warned about `--strict` being deprecated and told me to use `--strict-markers` instead, so that's what I did. --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 9ab69ba91..499e4de4b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,7 +25,7 @@ python_files = tests.py addopts = -ra - --strict + --strict-markers --ignore=docs/conf.py --ignore=setup.py --ignore=ci From 8f5f6c424e810848dd48ef18e15b264f715d9609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stephan=20G=C3=BCnther?= Date: Wed, 19 Oct 2022 03:23:03 +0200 Subject: [PATCH 3/5] Create unique `task_id`s for generated tasks Done by always using the `Dataset`'s name as a prefix to the `task_id`. To be extra sure, the module name is now also always used, even if it's not a sub-module of `egon.data.datasets`. --- src/egon/data/datasets/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/egon/data/datasets/__init__.py b/src/egon/data/datasets/__init__.py index bd4429019..c45e0d470 100644 --- a/src/egon/data/datasets/__init__.py +++ b/src/egon/data/datasets/__init__.py @@ -230,7 +230,7 @@ def __post_init__(self): # Explicitly create single final task, because we can't know # which of the multiple tasks finishes last. name = prefix(self) - name = name if name else f"{self.name}." + name = f"{name if name else f'{self.__module__}.'}{self.name}." update_version = PythonOperator( task_id=f"{name}update-version", # Do nothing, because updating will be added later. From 9ca4e4d9101f83fdda961480e7c752a994b47402 Mon Sep 17 00:00:00 2001 From: "Julian.Endres" Date: Tue, 1 Nov 2022 22:40:08 +0100 Subject: [PATCH 4/5] Add Changelog entry --- CHANGELOG.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 992ea0d26..bd2e9b2f6 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -615,6 +615,8 @@ Bug Fixes `#909 `_ * Overwrite capacities for conventional power plants with data from nep list `#403 `_ +* Create unique ids on generated tasks to avoid circular flows in pipeline + `#986 `_ .. _PR #692: https://github.com/openego/eGon-data/pull/692 .. _#343: https://github.com/openego/eGon-data/issues/343 From eafaea37e9f180230259b542d76b2b637c60f404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stephan=20G=C3=BCnther?= Date: Tue, 24 Jan 2023 19:30:44 +0100 Subject: [PATCH 5/5] Change "CHANGELOG.rst" entry The URL used to link to the corresponding issue was incorrect. It linked to the PR instead of the issue. This works because GH doesn't really distinguish between PRs and issues and automatically rewrites URLs according to what the number points to. But it still wasn't clear whether the link target was supposed to be the PR or the issue. Now both are linked, external links are used for readability and the wording is a tad more verbose and friendly (IMHO). --- CHANGELOG.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index bd2e9b2f6..ec49824d6 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -615,11 +615,13 @@ Bug Fixes `#909 `_ * Overwrite capacities for conventional power plants with data from nep list `#403 `_ -* Create unique ids on generated tasks to avoid circular flows in pipeline - `#986 `_ +* Automatically generated tasks now get unique :code:`task_id`\s. + Fixes issue `#985`_ via PR `#986`_. .. _PR #692: https://github.com/openego/eGon-data/pull/692 .. _#343: https://github.com/openego/eGon-data/issues/343 .. _#556: https://github.com/openego/eGon-data/issues/556 .. _#641: https://github.com/openego/eGon-data/issues/641 .. _#669: https://github.com/openego/eGon-data/issues/669 +.. _#985: https://github.com/openego/eGon-data/issues/985 +.. _#986: https://github.com/openego/eGon-data/pull/986