Allow configuring the database name #159

Merged: 59 commits, Mar 23, 2021

Changes from all commits

Commits
ec29ba0
Deprecate the `--database` option
gnn Mar 9, 2021
6dbf84e
Translate option flags to variable names and vice versa
gnn Mar 14, 2021
3a8c4f4
Remove flag to variable name, and vice versa, conversion
gnn Mar 14, 2021
151416e
Wrap main CLI command in an initialization function
gnn Mar 11, 2021
bdca0b0
Support proper configuration files
gnn Mar 11, 2021
30afef1
Clarify database configuration option limitations
gnn Mar 14, 2021
0de2967
Change the format of "local-database.yaml"
gnn Mar 14, 2021
f4b8c37
Drop support for "local-database.yaml"
gnn Mar 14, 2021
cf61cc1
Use `importlib_resources.files` to get AIRFLOW_HOME
gnn Mar 15, 2021
8f4e04d
Fix "disaggregator" dependency line length
gnn Mar 15, 2021
b61f98a
Make "Docker"ed database configurable
gnn Mar 15, 2021
9ae5a9a
Blacken "src/egon/data/db.py"
gnn Mar 15, 2021
2f21dae
Use a less common default database port
gnn Mar 15, 2021
9ff4473
Reorder how options are displayed in the help message
gnn Mar 15, 2021
61b4ea2
Respect CLI overrides when writing initial configuration
gnn Mar 15, 2021
81f67d6
Override configured values only with non-defaults
gnn Mar 15, 2021
bfa5be7
Use the "Docker"ed database for Airflow
gnn Mar 15, 2021
932882a
Use `LocalExecutor` which executes tasks in parallel
gnn Mar 15, 2021
90f2811
Put database data in a `volume`, not a `bind mount`
gnn Mar 15, 2021
5b5a20d
Add an option that limits the parallel job count
gnn Mar 15, 2021
30fd3d4
Work around `missing_ok` not being available
gnn Mar 15, 2021
0fc3b36
Use a proper logger to output information
gnn Mar 16, 2021
95a6c85
Note alternative implementation variants
gnn Mar 16, 2021
575817d
Condense option value collection
gnn Mar 16, 2021
fcc2bfb
Extract duplicate template rendering code
gnn Mar 16, 2021
3fa2dc9
Default to only one parallel job
gnn Mar 16, 2021
f492d2b
Order dependencies alphabetically
gnn Mar 16, 2021
73ba0c8
Add example how to choose different database to --help
ClaraBuettner Mar 16, 2021
2fd3209
Rephrase and increase verbosity of CLI help
Mar 16, 2021
07106fa
Remind users of using options and command in correct order
Mar 16, 2021
5981ff2
Add a note to troubleshooting section in docs
Mar 16, 2021
5870086
Fix indentation for correct parsing by sphinx
Mar 16, 2021
0ae0fa0
Explicitly name environment for .. code-block::
Mar 16, 2021
dc1c456
Enable `-h` as a shortcut for `--help`
gnn Mar 16, 2021
0c0e306
Remove trailing whitespace
gnn Mar 16, 2021
fee75b3
Allow choosing dataset boundaries
gnn Mar 16, 2021
b6702ac
Fix combining configuration settings
gnn Mar 16, 2021
775d8fb
Write unknown settings to configuration files
gnn Mar 16, 2021
eb92f91
Add a function retrieving configuration settings
gnn Mar 16, 2021
8369f64
Fix typo: "writte" to "written"
gnn Mar 16, 2021
aacca33
Fix typo: "directy" to "directory"
gnn Mar 16, 2021
8df9d64
Avoid parenthesis
gnn Mar 16, 2021
35a7c3c
Indent code examples
gnn Mar 16, 2021
e2abcf3
Fix typo: "Goto" to "Go to"
gnn Mar 16, 2021
c39dd18
Wrap docstring to 76 characters
gnn Mar 16, 2021
b88d7bd
Fix 'dockered' to '"Docker"ed'
gnn Mar 16, 2021
1cf539a
Replace PWD by CWD
Mar 17, 2021
09f7142
Add complete example for egon-data testmode
Mar 17, 2021
5345cd7
Read in dataset-boundaries from config.settings()
ClaraBuettner Mar 17, 2021
adc7b78
Explain in docs where to save download data files
Mar 17, 2021
e247474
Reference PR in CHANGELOG
Mar 17, 2021
b570814
Disambiguate and streamline the names in the compose file
gnn Mar 18, 2021
ab65602
Don't update existing "airflow.cfg"s by default
gnn Mar 18, 2021
578c976
Add a command line option for the name of the Docker container
gnn Mar 18, 2021
583df60
Run "pipeline.py" through `black`
gnn Mar 19, 2021
ca6cca7
Use `black` on "src/egon/data/importing/vg250.py"
gnn Mar 19, 2021
7a090f8
Remove `egon.data.confg.dataset_boundaries`
gnn Mar 19, 2021
2e13b1d
Drop the "--clip-datasets-to" option
gnn Mar 19, 2021
5b4b373
Remove `:pr:` uses from "CHANGELOG.rst"
gnn Mar 20, 2021
21 changes: 16 additions & 5 deletions CHANGELOG.rst
@@ -16,15 +16,23 @@ Added
displays better error messages than Python's built-in function. Use
this wrapper when calling other programs in Airflow tasks.

* You can now override the default database configuration by putting a
"local-database.yaml" into the current working directory. Values read
from this file will override the default values. You can also generate
this file by specifying command line switches to ``egon-data``. Look
for the switches starting with ``--database`` in ``egon-data --help``.
* You can now override the default database configuration using command
line arguments. Look for the switches starting with ``--database`` in
``egon-data --help``. See `PR #159`_ for more details.

* Docker will not be used if there is already a service listening on the
HOST:PORT combination configured for the database.
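
The check behind this behaviour is not shown in the diff below; a minimal
sketch of the idea, assuming a plain TCP probe against the configured host
and port, could look like this:

.. code-block:: python

    import socket

    def service_listening(host, port, timeout=1.0):
        """Return True if something already accepts connections on host:port.

        Hypothetical helper sketching the check described above; the actual
        implementation in egon-data may differ.
        """
        try:
            with socket.create_connection((host, port), timeout=timeout):
                return True
        except OSError:
            return False

    # Example with placeholder values: when this returns True, egon-data
    # keeps the existing service instead of starting its own container.
    if service_listening("127.0.0.1", 5432):
        print("Re-using the already running database service.")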

* You can now supply values for the command line arguments for
``egon-data`` using a configuration file. If the configuration file
doesn't exist, it will be created by ``egon-data`` on its first run.
Note that the configuration file is read from and written to the
directory in which ``egon-data`` is started, so it's probably best to
run ``egon-data`` in a dedicated directory.
There's also the new function `egon.data.config.settings` which
returns the current configuration settings. See `PR #159`_ for more
details.
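
The exact return structure of ``egon.data.config.settings`` is not spelled
out here; as a hedged sketch, assuming it simply hands back the collected
settings as a dictionary, usage might look like this:

.. code-block:: python

    from egon.data import config

    # Hypothetical usage; the keys and nesting of the returned mapping are
    # assumptions, so inspect the result once before relying on specific keys.
    settings = config.settings()
    for key, value in settings.items():
        print(key, "=", value)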

* OSM data import as done in open_ego
`#1 <https://github.com/openego/eGon-data/issues/1>`_
* Verwaltungsgebiete data import (vg250) more or less done as in open_ego
@@ -42,6 +50,9 @@ Added
* Option for running workflow in test mode
`#112 <https://github.com/openego/eGon-data/issues/112>`_

.. _PR #159: https://github.com/openego/eGon-data/pull/159


Changed
-------

14 changes: 14 additions & 0 deletions CONTRIBUTING.rst
@@ -9,6 +9,20 @@ little bit helps, and credit will always be given.
Extending the data workflow
===========================

Where to save (downloaded) data?
--------------------------------

If a task needs to retrieve data from external sources and save the data
locally, please use `CWD` to store the data. This can be achieved by using

.. code-block:: python

from pathlib import Path
from urllib.request import urlretrieve

filepath = Path(".") / "filename.csv"
urlretrieve("https://url/to/file", filepath)


Adjusting test mode data
------------------------
27 changes: 27 additions & 0 deletions docs/troubleshooting.rst
@@ -72,6 +72,33 @@ Once you got this, run :code:`kill -s INT NUMBER`, substituting
:code:`egon-data serve` should run without errors again.


``[ERROR] Cannot create container for service egon-data-local-database ...``
----------------------------------------------------------------------------

When building the Docker container for the Postgres database, you might
encounter an error like

.. code-block:: none

ERROR: for egon-data-local-database Cannot create container for service
egon-data-local-database: Conflict. The container name
"/egon-data-local-database" is already in use by container
"1ff9aadef273a76a0acbf850c0da794d0fb28a30e9840f818cca1a47d1181b00".
You have to remove (or rename) that container to be able to reuse that name.

If you're OK with deleting the data, stop and remove the container by running

.. code-block:: none

docker stop egon-data-local-database
docker rm -v egon-data-local-database

Alternatively, the container and its data can be kept by renaming the Docker container.

.. code-block:: none

docker rename egon-data-local-database NEW_CONTAINER_NAME

Other import or incompatible package version errors
===================================================

10 changes: 7 additions & 3 deletions setup.py
@@ -84,12 +84,16 @@ def read(*names, **kwargs):
install_requires=[
"apache-airflow<2.0",
"click",
"disaggregator"
"@git+https://github.com/openego/disaggregator.git"
"@features/pip_install",
"geopandas",
"importlib_resources",
"loguru",
"oedialect==0.0.8",
"pyaml",
"psycopg2",
"pyaml",
"sqlalchemy",
"geopandas",
"disaggregator @ git+https://github.com/openego/disaggregator.git@features/pip_install"
# eg: 'aspectlib==1.1.1', 'six>=1.7',
],
extras_require={
18 changes: 18 additions & 0 deletions src/egon/data/__init__.py
@@ -1 +1,19 @@
from textwrap import wrap

from loguru import logger
import click

__version__ = "0.0.0"


def echo(message):
prefix, message = message.split(" - ")
lines = message.split("\n")
width = min(72, click.get_terminal_size()[0])
wraps = ["\n".join(wrap(line, width)) for line in lines]
message = "\n".join([prefix] + wraps)
click.echo(message, err=True)


logger.remove()
logger.add(echo, colorize=True)
34 changes: 17 additions & 17 deletions src/egon/data/airflow/airflow.cfg
@@ -1,11 +1,11 @@
[core]
# The folder where your airflow pipelines live, most likely a
# subfolder in a code repository. This path must be absolute.
# dags_folder = ${AIRFLOW_HOME}
dags_folder = {dags}

# The folder where airflow should store its log files
# This path must be absolute
# base_log_folder = ${AIRFLOW_HOME}/logs
# base_log_folder = ${{AIRFLOW_HOME}}/logs

# Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search.
# Set this to True if you want to enable remote logging.
@@ -34,17 +34,17 @@ logging_config_class =
colored_console_log = True

# Log format for when Colored logs is enabled
colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s
colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {{%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d}} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s
colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter

# Format of Log line
log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s
log_format = [%%(asctime)s] {{%%(filename)s:%%(lineno)d}} %%(levelname)s - %%(message)s
simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s

# Log filename format
log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log
log_processor_filename_template = {{ filename }}.log
# dag_processor_manager_log_location = ${AIRFLOW_HOME}/logs/dag_processor_manager/dag_processor_manager.log
log_filename_template = {{{{ ti.dag_id }}}}/{{{{ ti.task_id }}}}/{{{{ ts }}}}/{{{{ try_number }}}}.log
log_processor_filename_template = {{{{ filename }}}}.log
# dag_processor_manager_log_location = ${{AIRFLOW_HOME}}/logs/dag_processor_manager/dag_processor_manager.log

# Name of handler to read task instance logs.
# Default to use task handler.
@@ -66,12 +66,12 @@ default_timezone = utc

# The executor class that airflow should use. Choices include
# SequentialExecutor, LocalExecutor, CeleryExecutor, DaskExecutor, KubernetesExecutor
executor = SequentialExecutor
executor = LocalExecutor

# The SqlAlchemy connection string to the metadata database.
# SqlAlchemy supports many different database engine, more information
# their website
# sql_alchemy_conn = sqlite:///${AIRFLOW_HOME}/airflow.db
sql_alchemy_conn = postgresql+psycopg2://{--database-user}:{--database-password}@{--database-host}:{--database-port}/{--airflow-database-name}

# The encoding for the databases
sql_engine_encoding = utf-8
@@ -122,7 +122,7 @@ sql_alchemy_schema =
parallelism = 32

# The number of task instances allowed to run concurrently by the scheduler
dag_concurrency = 16
dag_concurrency = {--jobs}

# Are DAGs paused by default at creation
dags_are_paused_at_creation = False
@@ -141,7 +141,7 @@ load_examples = False
load_default_connections = True

# Where your Airflow plugins are stored
# plugins_folder = ${AIRFLOW_HOME}/plugins
# plugins_folder = ${{AIRFLOW_HOME}}/plugins

# Secret key to save connection passwords in the db
fernet_key =
@@ -237,7 +237,7 @@ backend =
# The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class.
# See documentation for the secrets backend you are using. JSON is expected.
# Example for AWS Systems Manager ParameterStore:
# ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}``
# ``{{"connections_prefix": "/airflow/connections", "profile_name": "default"}}``
backend_kwargs =

[cli]
@@ -332,7 +332,7 @@ reload_on_plugin_change = False

# Secret key used to run your flask app
# It should be as random as possible
secret_key = {SECRET_KEY}
secret_key = {{SECRET_KEY}}

# Number of workers to run the Gunicorn web server
workers = 4
@@ -631,7 +631,7 @@ print_stats_interval = 30
# ago (in seconds), scheduler is considered unhealthy.
# This is used by the health check in the "/health" endpoint
scheduler_health_check_threshold = 30
# child_process_log_directory = ${AIRFLOW_HOME}/logs/scheduler
# child_process_log_directory = ${{AIRFLOW_HOME}}/logs/scheduler

# Local task jobs periodically heartbeat to the DB. If the job has
# not heartbeat in this many seconds, the scheduler will mark the
@@ -765,7 +765,7 @@ hide_sensitive_variable_fields = True
host =

# Format of the log_id, which is used to query for a given tasks logs
log_id_template = {dag_id}-{task_id}-{execution_date}-{try_number}
log_id_template = {{dag_id}}-{{task_id}}-{{execution_date}}-{{try_number}}

# Used to mark the end of a log stream for a task
end_of_log_mark = end_of_log
@@ -900,7 +900,7 @@ git_sync_root = /git
git_sync_dest = repo

# Mount point of the volume if git-sync is being used.
# i.e. ${AIRFLOW_HOME}/dags
# i.e. ${{AIRFLOW_HOME}}/dags
git_dags_folder_mount_point =

# To get Git-sync SSH authentication set up follow this format
@@ -1014,7 +1014,7 @@ kube_client_request_args =
# This should be an object and can contain any of the options listed in the ``v1DeleteOptions``
# class defined here:
# https://github.com/kubernetes-client/python/blob/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/models/v1_delete_options.py#L19
# Example: delete_option_kwargs = {"grace_period_seconds": 10}
# Example: delete_option_kwargs = {{"grace_period_seconds": 10}}
delete_option_kwargs =

# Specifies the uid to run the first process of the worker pods containers as
41 changes: 18 additions & 23 deletions src/egon/data/airflow/dags/pipeline.py
@@ -20,16 +20,16 @@
# Prepare connection to db for operators
airflow_db_connection()

# Temporary set dataset variable here
dataset = 'Schleswig-Holstein'

with airflow.DAG(
"egon-data-processing-pipeline",
description="The eGo^N data processing DAG.",
default_args={"start_date": days_ago(1)},
template_searchpath=[
os.path.abspath(os.path.join(os.path.dirname(
__file__), '..', '..', 'processing', 'vg250'))
os.path.abspath(
os.path.join(
os.path.dirname(__file__), "..", "..", "processing", "vg250"
)
)
],
is_paused_upon_creation=False,
schedule_interval=None,
@@ -40,12 +40,10 @@
osm_download = PythonOperator(
task_id="download-osm",
python_callable=import_osm.download_pbf_file,
op_args={dataset},
)
osm_import = PythonOperator(
task_id="import-osm",
python_callable=import_osm.to_postgres,
op_args={dataset},
)
osm_migrate = PythonOperator(
task_id="migrate-osm",
@@ -54,7 +52,6 @@
osm_add_metadata = PythonOperator(
task_id="add-osm-metadata",
python_callable=import_osm.add_metadata,
op_args={dataset},
)
setup >> osm_download >> osm_import >> osm_migrate >> osm_add_metadata

@@ -64,8 +61,8 @@
python_callable=import_vg250.download_vg250_files,
)
vg250_import = PythonOperator(
task_id="import-vg250", python_callable=import_vg250.to_postgres,
op_args={dataset}
task_id="import-vg250",
python_callable=import_vg250.to_postgres,
)

vg250_nuts_mview = PostgresOperator(
@@ -90,29 +87,27 @@
# Zensus import
zensus_download_population = PythonOperator(
task_id="download-zensus-population",
python_callable=import_zs.download_zensus_pop
python_callable=import_zs.download_zensus_pop,
)

zensus_download_misc = PythonOperator(
task_id="download-zensus-misc",
python_callable=import_zs.download_zensus_misc
python_callable=import_zs.download_zensus_misc,
)

zensus_tables = PythonOperator(
task_id="create-zensus-tables",
python_callable=import_zs.create_zensus_tables
python_callable=import_zs.create_zensus_tables,
)

population_import = PythonOperator(
task_id="import-zensus-population",
python_callable=import_zs.population_to_postgres,
op_args={dataset}
)

zensus_misc_import = PythonOperator(
task_id="import-zensus-misc",
python_callable=import_zs.zensus_misc_to_postgres,
op_args={dataset}
)
setup >> zensus_download_population >> zensus_download_misc
zensus_download_misc >> zensus_tables >> population_import
@@ -129,35 +124,35 @@
# Power plant setup
power_plant_tables = PythonOperator(
task_id="create-power-plant-tables",
python_callable=power_plants.create_tables
python_callable=power_plants.create_tables,
)
setup >> power_plant_tables


# NEP data import
create_tables = PythonOperator(
task_id="create-scenario-tables",
python_callable=nep_input.create_scenario_input_tables)
python_callable=nep_input.create_scenario_input_tables,
)

nep_insert_data = PythonOperator(
task_id="insert-nep-data",
python_callable=nep_input.insert_data_nep,
op_args={dataset})
)

setup >> create_tables >> nep_insert_data
vg250_clean_and_prepare >> nep_insert_data

population_import >> nep_insert_data

# setting etrago input tables
etrago_input_data = PythonOperator(
task_id = "setting-etrago-input-tables",
python_callable = etrago.create_tables
task_id="setting-etrago-input-tables",
python_callable=etrago.create_tables,
)
setup >> etrago_input_data

# Retrieve MaStR data
retrieve_mastr_data = PythonOperator(
task_id="retrieve_mastr_data",
python_callable=mastr.download_mastr_data
python_callable=mastr.download_mastr_data,
)
setup >> retrieve_mastr_data