add execution contexts #69

Draft. Wants to merge 21 commits into base branch 4.0.x.

Commits (21):
1b6ab89  use mara_storage for abstraction of local storage interactions (ice1e0, Oct 21, 2020)
506c4ec  several build fixes (ice1e0, Oct 27, 2020)
a530b4c  improve version handling + set expected release version to 3.2.0 (ice1e0, Oct 28, 2020)
7278b0a  upgrade to mara-storage 1.0.0 (leo-schick, Jun 19, 2022)
ff187b8  fix _ParallelRead since merge of mara_storage #55 (leo-schick, Jun 19, 2022)
a0de9d4  improve version handling + set expected release version to 3.2.0 (ice1e0, Oct 28, 2020)
f3f177a  add execution context (leo-schick, Apr 25, 2022)
367d9f0  ssh add password option and fix pipefail option execution (leo-schick, Apr 26, 2022)
161edac  fix ssh execution context + move to contexts.bash module (leo-schick, Apr 26, 2022)
c53cc52  add docker execution context (leo-schick, Apr 26, 2022)
21daa2e  fix Copy command execution context (leo-schick, Apr 27, 2022)
39b1d56  use task.run(context=context) where implemented + use optimistic patt… (leo-schick, May 4, 2022)
21fb892  fix passing args/kargs to task/command.run (leo-schick, May 5, 2022)
1e1609a  clean code (leo-schick, May 5, 2022)
a939b8d  use cache for contexts.context(alias) function (leo-schick, May 5, 2022)
a7272c4  fix run (leo-schick, May 17, 2022)
ce2e06a  fix support legacy commands run (leo-schick, May 20, 2022)
fbc6f95  support executing scripts through docker context V2 (leo-schick, Jun 13, 2022)
99e11d8  fix duplicated arg. -c (leo-schick, Jun 15, 2022)
b926b32  fix pipeline hang when context is not configured (leo-schick, Jul 11, 2022)
cfd89fd  always use logger, not print (leo-schick, Feb 24, 2023)
44 changes: 38 additions & 6 deletions mara_pipelines/commands/files.py
@@ -1,5 +1,6 @@
"""Commands for reading files"""

import deprecation
import json
import pathlib
import shlex
@@ -9,19 +10,28 @@

import mara_db.dbs
import mara_db.shell
import mara_storage.storages
from mara_storage.shell import read_file_command
from . import sql
from mara_page import _, html
from .. import config, pipelines
import mara_pipelines


class Compression(enum.Enum):
@deprecation.deprecated(deprecated_in='3.2.0', removed_in='4.0.0',
current_version=mara_pipelines.__version__,
details='Use mara_storage.compression.Compression instead')
class Compression(enum.EnumMeta):
"""Different compression formats that are understood by file readers"""
NONE = 'none'
GZIP = 'gzip'
TAR_GZIP = 'tar.gzip'
ZIP = 'zip'


@deprecation.deprecated(deprecated_in='3.2.0', removed_in='4.0.0',
current_version=mara_pipelines.__version__,
details='Use mara_storage.compression.uncompressor instead')
def uncompressor(compression: Compression) -> str:
"""Maps compression methods to command line programs that can unpack the respective files"""
return {Compression.NONE: 'cat',
@@ -35,7 +45,8 @@ class ReadFile(pipelines.Command):

def __init__(self, file_name: str, compression: Compression, target_table: str,
mapper_script_file_name: str = None, make_unique: bool = False,
db_alias: str = None, csv_format: bool = False, skip_header: bool = False,
db_alias: str = None, storage_alias: str = None,
csv_format: bool = False, skip_header: bool = False,
delimiter_char: str = None, quote_char: str = None,
null_value_string: str = None, timezone: str = None) -> None:
super().__init__()
@@ -48,6 +59,7 @@ def __init__(self, file_name: str, compression: Compression, target_table: str,
self.csv_format = csv_format
self.skip_header = skip_header
self._db_alias = db_alias
self._storage_alias = storage_alias
self.delimiter_char = delimiter_char
self.quote_char = quote_char
self.null_value_string = null_value_string
@@ -56,6 +68,10 @@
def db_alias(self):
return self._db_alias or config.default_db_alias()

@property
def storage_alias(self):
return self._storage_alias or config.default_storage_alias()

def shell_command(self):
copy_from_stdin_command = mara_db.shell.copy_from_stdin_command(
self.db_alias(), csv_format=self.csv_format, target_table=self.target_table,
@@ -64,14 +80,17 @@ def shell_command(self):
null_value_string=self.null_value_string, timezone=self.timezone)
if not isinstance(mara_db.dbs.db(self.db_alias()), mara_db.dbs.BigQueryDB):
return \
f'{uncompressor(self.compression)} "{pathlib.Path(config.data_dir()) / self.file_name}" \\\n' \
f'{read_file_command(self.storage_alias, file_name=self.file_name, compression=self.compression)} \\\n' \
+ (f' | {shlex.quote(sys.executable)} "{self.mapper_file_path()}" \\\n'
if self.mapper_script_file_name else '') \
+ (' | sort -u \\\n' if self.make_unique else '') \
+ ' | ' + copy_from_stdin_command
else:
# Bigquery loading does not support streaming data through pipes
return copy_from_stdin_command + f" {pathlib.Path(config.data_dir()) / self.file_name}"
storage = mara_storage.storages.storage(self.storage_alias)
if not isinstance(storage, mara_storage.storages.LocalStorage):
raise ValueError('The ReadFile to a BigQuery database can only be used from a storage alias of type LocalStorage')
return copy_from_stdin_command + f' {shlex.quote(str( (storage.base_path / self.file_name).absolute() ))}'

def mapper_file_path(self):
return self.parent.parent.base_path() / self.mapper_script_file_name
@@ -86,6 +105,7 @@ def html_doc_items(self) -> [(str, str)]:
('make unique', _.tt[self.make_unique]),
('target_table', _.tt[self.target_table]),
('db alias', _.tt[self.db_alias()]),
('storage alias', _.tt[self.storage_alias]),
('csv format', _.tt[self.csv_format]),
('skip header', _.tt[self.skip_header]),
('delimiter char',
@@ -100,29 +120,41 @@
class ReadSQLite(sql._SQLCommand):
def __init__(self, sqlite_file_name: str, target_table: str,
sql_statement: str = None, sql_file_name: str = None, replace: {str: str} = None,
db_alias: str = None, timezone: str = None) -> None:
db_alias: str = None, storage_alias: str = None, timezone: str = None) -> None:
if not isinstance(mara_storage.storages.storage(storage_alias), mara_storage.storages.LocalStorage):
raise ValueError('The ReadSQLite task can only be used from a storage alias of type LocalStorage')
sql._SQLCommand.__init__(self, sql_statement, sql_file_name, replace)
self.sqlite_file_name = sqlite_file_name

self.target_table = target_table
self._db_alias = db_alias
self._storage_alias = storage_alias
self.timezone = timezone

@property
def db_alias(self):
return self._db_alias or config.default_db_alias()

@property
def storage_alias(self):
return self._storage_alias or config.default_storage_alias()

def shell_command(self):
storage = mara_storage.storages.storage(self.storage_alias)
if not isinstance(storage, mara_storage.storages.LocalStorage):
raise ValueError('The ReadSQLite task can only be used from a storage alias of type LocalStorage')

return (sql._SQLCommand.shell_command(self)
+ ' | ' + mara_db.shell.copy_command(
mara_db.dbs.SQLiteDB(file_name=config.data_dir().absolute() / self.sqlite_file_name),
mara_db.dbs.SQLiteDB(file_name=(storage.base_path / self.sqlite_file_name).absolute()),
self.db_alias, self.target_table, timezone=self.timezone))

def html_doc_items(self) -> [(str, str)]:
return [('sqlite file name', _.i[self.sqlite_file_name])] \
+ sql._SQLCommand.html_doc_items(self, None) \
+ [('target_table', _.tt[self.target_table]),
('db alias', _.tt[self.db_alias]),
('storage alias', _.tt[self.storage_alias]),
('time zone', _.tt[self.timezone]),
(_.i['shell command'], html.highlight_syntax(self.shell_command(), 'bash'))]

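Taken together, the files.py changes route local file access through mara_storage and add a storage_alias parameter to ReadFile and ReadSQLite, defaulting to the new config.default_storage_alias(). A minimal usage sketch follows; the pipeline, table, and file names are illustrative, and Compression.GZIP is assumed to exist in mara_storage.compression, as the deprecation notices above imply.

from mara_storage.compression import Compression  # replaces the enum deprecated above
from mara_pipelines.commands.files import ReadFile
from mara_pipelines.pipelines import Pipeline, Task

pipeline = Pipeline(id='read_file_demo',
                    description='Loads a compressed CSV from the configured storage')

pipeline.add(Task(
    id='read_events',
    description='Copies events.csv.gz into the events table',
    commands=[ReadFile(file_name='events.csv.gz',
                       compression=Compression.GZIP,
                       target_table='public.events',
                       storage_alias='data',  # omit to use config.default_storage_alias()
                       csv_format=True,
                       skip_header=True)]))

For non-BigQuery targets the file is streamed through mara_storage.shell.read_file_command into copy_from_stdin_command; for BigQuery the storage alias must resolve to a LocalStorage, which the new ValueError above enforces.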
6 changes: 3 additions & 3 deletions mara_pipelines/commands/python.py
@@ -29,7 +29,7 @@ def __init__(self, function: Callable = None, args: [str] = None, file_dependenc
self.args = args or []
self.file_dependencies = file_dependencies or []

def run(self) -> bool:
def run(self, *args, **kargs) -> bool:
dependency_type = 'RunFunction ' + self.function.__name__
if self.file_dependencies:
assert (self.parent)
@@ -78,7 +78,7 @@ def file_name(self):
def args(self):
return self._args() if callable(self._args) else self._args

def run(self) -> bool:
def run(self, *args, **kargs) -> bool:
dependency_type = 'ExecutePython ' + self.file_name
if self.file_dependencies:
assert (self.parent)
@@ -89,7 +89,7 @@ def run(self) -> bool:
logger.log('no changes')
return True

if not super().run():
if not super().run(*args, **kargs):
return False

if self.file_dependencies:
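python.py only widens run(self) to run(self, *args, **kargs) and forwards the arguments to super().run(), so that an execution context can be threaded through without breaking commands written before this branch. A sketch of the "optimistic patt…" approach that commit 39b1d56 refers to; the run_command helper is hypothetical and not part of this diff.

def run_command(command, context) -> bool:
    # Optimistically pass the context; commands on this branch accept and
    # forward *args/**kargs, so the keyword reaches whoever needs it.
    try:
        return command.run(context=context)
    except TypeError:
        # Legacy command: its run() still has the old zero-argument signature.
        # (A real implementation would need to distinguish this case from a
        # TypeError raised inside run() itself.)
        return command.run()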
36 changes: 19 additions & 17 deletions mara_pipelines/commands/sql.py
@@ -102,7 +102,7 @@ def __init__(self, sql_statement: str = None, sql_file_name: Union[str, Callable
def db_alias(self):
return self._db_alias or config.default_db_alias()

def run(self) -> bool:
def run(self, *args, **kargs) -> bool:
if self.sql_file_name:
logger.log(self.sql_file_name, logger.Format.ITALICS)

@@ -124,7 +124,7 @@ def run(self) -> bool:
# probably not be there (usually the first step is a DROP).
file_dependencies.delete(self.node_path(), dependency_type)

if not super().run():
if not super().run(*args, **kargs):
return False

if self.file_dependencies:
@@ -167,7 +167,7 @@ def target_db_alias(self):
def file_path(self) -> pathlib.Path:
return self.parent.parent.base_path() / self.sql_file_name

def run(self) -> bool:
def run(self, *args, **kargs) -> bool:
if self.sql_file_name:
logger.log(self.sql_file_name, logger.Format.ITALICS)

@@ -187,7 +187,7 @@ def run(self) -> bool:
# (see also above in ExecuteSQL)
file_dependencies.delete(self.node_path(), dependency_type)

if not super().run():
if not super().run(*args, **kargs):
return False

if self.file_dependencies:
@@ -263,15 +263,17 @@ def __init__(self, source_db_alias: str, source_table: str,
def target_db_alias(self):
return self._target_db_alias or config.default_db_alias()

def run(self) -> bool:
def run(self, *args, **kargs) -> bool:
run_shell_command = kargs['context'].run_shell_command if 'context' in kargs else shell.run_shell_command

# retrieve the highest current value for the modification comparison (e.g.: the highest timestamp)
# We intentionally use the command line here (rather than sqlalchemy) to avoid forcing people python drivers,
# which can be hard for example in the case of SQL Server
logger.log(f'Get new max modification comparison value...', format=logger.Format.ITALICS)
max_value_query = f'SELECT max({self.modification_comparison}) AS maxval FROM {self.source_table}'
logger.log(max_value_query, format=logger.Format.VERBATIM)
result = shell.run_shell_command(f'echo {shlex.quote(max_value_query)} \\\n | '
+ mara_db.shell.copy_to_stdout_command(self.source_db_alias))
result = run_shell_command(f'echo {shlex.quote(max_value_query)} \\\n | '
+ mara_db.shell.copy_to_stdout_command(self.source_db_alias))

if not result:
return False
@@ -323,7 +325,7 @@ def run(self) -> bool:
# overwrite the comparison criteria to get everything
replace = {self.comparison_value_placeholder: '(1=1)'}
complete_copy_command = self._copy_command(self.target_table, replace)
if not shell.run_shell_command(complete_copy_command):
if not run_shell_command(complete_copy_command):
return False

else:
@@ -332,16 +334,16 @@
create_upsert_table_query = (f'DROP TABLE IF EXISTS {self.target_table}_upsert;\n'
+ f'CREATE TABLE {self.target_table}_upsert AS SELECT * from {self.target_table} WHERE FALSE')

if not shell.run_shell_command(f'echo {shlex.quote(create_upsert_table_query)} \\\n | '
+ mara_db.shell.query_command(self.target_db_alias)):
if not run_shell_command(f'echo {shlex.quote(create_upsert_table_query)} \\\n | '
+ mara_db.shell.query_command(self.target_db_alias)):
return False

# perform the actual copy replacing the placeholder
# with the comparison value from the latest successful execution
modification_comparison_type = self.modification_comparison_type or ''
replace = {self.comparison_value_placeholder:
f'({self.modification_comparison} >= {modification_comparison_type} \'{last_comparison_value}\')'}
if not shell.run_shell_command(self._copy_command(self.target_table + '_upsert', replace)):
if not run_shell_command(self._copy_command(self.target_table + '_upsert', replace)):
return False

# now the upsert table has to be merged with the target one
@@ -370,11 +372,11 @@ def run(self) -> bool:
SELECT src.*
FROM {self.target_table}_upsert src
WHERE NOT EXISTS (SELECT 1 FROM {self.target_table} dst WHERE {key_definition})"""
if not shell.run_shell_command(f'echo {shlex.quote(update_query)} \\\n | '
+ mara_db.shell.query_command(self.target_db_alias)):
if not run_shell_command(f'echo {shlex.quote(update_query)} \\\n | '
+ mara_db.shell.query_command(self.target_db_alias)):
return False
elif not shell.run_shell_command(f'echo {shlex.quote(insert_query)} \\\n | '
+ mara_db.shell.query_command(self.target_db_alias)):
elif not run_shell_command(f'echo {shlex.quote(insert_query)} \\\n | '
+ mara_db.shell.query_command(self.target_db_alias)):
return False
else:
upsery_query = f"""
@@ -383,8 +385,8 @@ def run(self) -> bool:
FROM {self.target_table}_upsert
ON CONFLICT ({key_definition})
DO UPDATE SET {set_clause}"""
if not shell.run_shell_command(f'echo {shlex.quote(upsery_query)} \\\n | '
+ mara_db.shell.query_command(self.target_db_alias)):
if not run_shell_command(f'echo {shlex.quote(upsery_query)} \\\n | '
+ mara_db.shell.query_command(self.target_db_alias)):
return False

# update data_integration_incremental_copy_status
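The dispatch line at the top of Copy.run() is the core idiom of the execution-context mechanism: a command no longer calls shell.run_shell_command directly but takes the runner from the context handed in through kargs. Isolated as a sketch below; only the run_shell_command attribute of a context is evidenced by this diff, and the command class itself is illustrative.

from mara_pipelines import shell

class ContextAwareCommand:
    def run(self, *args, **kargs) -> bool:
        # Use the execution context's shell runner when a context was passed,
        # otherwise fall back to running the command locally.
        run_shell_command = (kargs['context'].run_shell_command
                             if 'context' in kargs
                             else shell.run_shell_command)
        return bool(run_shell_command('echo "runs in the configured context"'))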
30 changes: 30 additions & 0 deletions mara_pipelines/config.py
@@ -1,19 +1,30 @@
"""Configuration of data integration pipelines and how to run them"""

import datetime
import deprecation
import functools
import multiprocessing
import pathlib
import typing

from mara_app.monkey_patch import patch
import mara_storage.config
import mara_storage.storages

from . import pipelines, events
from .contexts import ExecutionContext
from .contexts.bash import BashExecutionContext
import mara_pipelines


def root_pipeline() -> 'pipelines.Pipeline':
"""A pipeline that contains all other pipelines of the project"""
return pipelines.demo_pipeline()


@deprecation.deprecated(deprecated_in='3.2.0', removed_in='4.0.0',
current_version=mara_pipelines.__version__,
details='Use mara_storage.config.storages instead')
def data_dir() -> str:
"""Where to find local data files"""
return str(pathlib.Path('data').absolute())
@@ -24,6 +35,15 @@ def default_db_alias() -> str:
return 'dwh-etl'


def default_storage_alias() -> str:
"""The alias of the storage that should be used when not specified otherwise"""
return 'data'

@patch(mara_storage.config.storages)
def storages() -> {str: mara_storage.storages.Storage}:
return {'data': mara_storage.storages.LocalStorage(base_path=pathlib.Path(data_dir()))}


def default_task_max_retries():
"""How many times a task is retried when it fails by default """
return 0
@@ -49,6 +69,16 @@ def bash_command_string() -> str:
return '/usr/bin/env bash -o pipefail'


def default_execution_context() -> str:
"""Sets the default execution context"""
return 'bash'


def execution_contexts() -> {str: ExecutionContext}:
"""The available execution contexts"""
return {'bash': BashExecutionContext()}


def system_statistics_collection_period() -> typing.Union[float, None]:
"""
How often should system statistics be collected in seconds.
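config.py wires the feature up: default_execution_context() names the default alias and execution_contexts() maps aliases to ExecutionContext instances, with only the bash context registered out of the box. A project would override both with mara_app.monkey_patch.patch, the same mechanism this diff uses for mara_storage.config.storages. In the sketch below, DockerExecutionContext, its module path, and its constructor argument are guesses based on the commit messages ("add docker execution context"), not on anything shown in this diff.

from mara_app.monkey_patch import patch

import mara_pipelines.config
from mara_pipelines.contexts.bash import BashExecutionContext
from mara_pipelines.contexts.docker import DockerExecutionContext  # assumed module path


@patch(mara_pipelines.config.execution_contexts)
def execution_contexts():
    # Register an additional context next to the default bash one.
    return {'bash': BashExecutionContext(),
            'etl': DockerExecutionContext(container_name='etl')}  # assumed signature


@patch(mara_pipelines.config.default_execution_context)
def default_execution_context() -> str:
    return 'etl'

Aliases are then resolved (and cached) through the contexts.context(alias) function added in commit a939b8d; commit b926b32 suggests that an alias missing from execution_contexts() originally made the pipeline hang instead of failing fast.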