diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ee9038afe..d3aae4cc1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: rev: 3.8.1 hooks: - id: flake8 - additional_dependencies: [flake8-isort] + language_version: python3 - repo: https://github.com/thclark/pre-commit-sphinx rev: 0.0.1 @@ -42,7 +42,6 @@ repos: hooks: - id: check-branch-name args: - - '^master$' - '^main$' - '^development$' - '^devops/([a-z][a-z0-9]*)(-[a-z0-9]+)*$' diff --git a/octue/__init__.py b/octue/__init__.py index a4d99cafc..01c882a12 100644 --- a/octue/__init__.py +++ b/octue/__init__.py @@ -1,2 +1,5 @@ -from .cli import octue_cli, octue_run, octue_version, pass_analysis # noqa: F401 -from .runner import LOG_FORMAT, Runner # noqa: F401 +from .cli import octue_cli +from .runner import LOG_FORMAT, Runner + + +__all__ = "LOG_FORMAT", "octue_cli", "Runner" diff --git a/octue/cli.py b/octue/cli.py index d83991685..f8c02f40b 100644 --- a/octue/cli.py +++ b/octue/cli.py @@ -1,20 +1,13 @@ -import importlib import os import sys -from functools import update_wrapper import click +import pkg_resources - -FOLDERS = ( - "configuration", - "input", - "log", - "tmp", - "output", -) +from octue.definitions import FOLDER_DEFAULTS, MANIFEST_FILENAME, VALUES_FILENAME +from octue.runner import Runner -@click.group() +@click.group(context_settings={"help_option_names": ["-h", "--help"]}) @click.option( "--id", default=None, @@ -43,146 +36,79 @@ show_default=True, help="Forces a reset of analysis cache and outputs [For future use, currently not implemented]", ) +@click.version_option(version=pkg_resources.get_distribution("octue").version) +@click.pass_context +def octue_cli(ctx, id, skip_checks, log_level, force_reset): + """ Octue CLI, enabling a data service / digital twin to be run like a command line application. + + When acting in CLI mode, results are read from and written to disk (see + https://octue-python-sdk.readthedocs.io/en/latest/ for how to run your application directly without the CLI). + Once your application has run, you'll be able to find output values and manifest in your specified --output-dir. 
+ """ + # TODO Forward command line options to runner via ctx + ctx.ensure_object(dict) + + +@octue_cli.command() @click.option( - "--configuration-values", + "--app-dir", type=click.Path(), - default="/input/input_values.json", + default=".", show_default=True, - help="Source for configuration_values strand data.", + help="Directory containing your source code (app.py)", ) @click.option( - "--configuration-manifest", + "--data-dir", type=click.Path(), - default="/input/input_values.json", + default=".", show_default=True, - help="Source for configuration_manifest strand data.", + help="Location of directories containing configuration values and manifest, input values and manifest, and output " + "directory.", ) @click.option( - "--input-values", + "--config-dir", type=click.Path(), - default="/input/input_values.json", + default=None, show_default=True, - help="Source for input_values strand data.", + help="Directory containing configuration (overrides --data-dir).", ) @click.option( - "--input-manifest", + "--input-dir", type=click.Path(), - default="/input/input_manifest.json", + default=None, show_default=True, - help="Source for input_manifest strand data.", + help="Directory containing input (overrides --data-dir).", ) @click.option( "--output-dir", type=click.Path(), - default="output", - show_default=False, - help="Directory to write outputs as files.", + default=None, + show_default=True, + help="Directory to write outputs as files (overrides --data-dir).", ) @click.option( - "--log-dir", type=click.Path(), default="logs", show_default=True, help="Path to the location of log files", + "--twine", type=click.Path(), default="twine.json", show_default=True, help="Location of Twine file.", ) -@click.pass_context -def octue_cli( - ctx, - id, - skip_checks, - log_level, - force_reset, - configuration_values, - configuration_manifest, - input_values, - input_manifest, - data_dir, - input_dir, - tmp_dir, - output_dir, - log_dir, -): - """ Octue CLI, enabling a data service / digital twin to be run like a command line application. - - Provide sources of configuration and/or input data and run the app. A source can be: - - - A path (relative or absolute) to a directory containing a .json file (eg `path/to/dir`). - - A path to a .json file (eg `path/to/configuration_values.json`). - - A literal JSON string (eg `{"n_iterations": 10}`. - - """ - - # We want to show meaningful defaults in the CLI help but unfortunately have to strip out the displayed values here - if input_values.startswith("/"): - input_dir = None # noqa - if log_dir.startswith("/"): - log_dir = None # noqa - if output_dir.startswith("/"): - output_dir = None # noqa - - ctx.ensure_object(dict) - ctx.obj["analysis"] = "VIBRATION" - - -def pass_analysis(f): - @click.pass_context - def new_func(ctx, *args, **kwargs): - return ctx.invoke(f, ctx.obj["analysis"], *args, **kwargs) - - return update_wrapper(new_func, f) - - -def octue_run(f): - """ Decorator for the main `run` function which adds a command to the CLI and prepares analysis ready for the run - """ - - @octue_cli.command() - @pass_analysis - def run(*args, **kwargs): - return f(*args, **kwargs) - - return update_wrapper(run, f) - - -def octue_version(f): - """ Decorator for the main `version` function which adds a command to the CLI - """ - - @octue_cli.command() - def version(*args, **kwargs): - return f(*args, **kwargs) - - return update_wrapper(version, f) - - -def unwrap(fcn): - """ Recurse through wrapping to get the raw function without decorators. 
- """ - if hasattr(fcn, "__wrapped__"): - return unwrap(fcn.__wrapped__) - return fcn - - -class AppFrom: - """ Context manager that allows us to temporarily add an app's location to the system path and - extract its run function - - with AppFrom('/path/to/dir') as app: - Runner().run(app) - - """ - - def __init__(self, app_path="."): - self.app_path = os.path.abspath(os.path.normpath(app_path)) - self.app_module = None - - def __enter__(self): - sys.path.insert(0, self.app_path) - self.app_module = importlib.import_module("app") - return self - - def __exit__(self, exc_type, exc_value, traceback): - if self.app_path in sys.path: - sys.path.remove(self.app_path) - - @property - def run(self): - """ Returns the unwrapped run function from app.py in the application's root directory - """ - return unwrap(self.app_module.run) +def run(app_dir, data_dir, config_dir, input_dir, output_dir, twine): + config_dir = config_dir or os.path.join(data_dir, FOLDER_DEFAULTS["configuration"]) + input_dir = input_dir or os.path.join(data_dir, FOLDER_DEFAULTS["input"]) + output_dir = output_dir or os.path.join(data_dir, FOLDER_DEFAULTS["output"]) + + runner = Runner( + twine=twine, + configuration_values=os.path.join(config_dir, VALUES_FILENAME), + configuration_manifest=os.path.join(config_dir, MANIFEST_FILENAME), + ) + analysis = runner.run( + app_src=app_dir, + input_values=os.path.join(input_dir, VALUES_FILENAME), + input_manifest=os.path.join(input_dir, MANIFEST_FILENAME), + output_manifest_path=os.path.join(output_dir, MANIFEST_FILENAME), + ) + analysis.finalise(output_dir=output_dir) + return 0 + + +if __name__ == "__main__": + args = sys.argv[1:] if len(sys.argv) > 1 else [] + octue_cli(args) diff --git a/octue/definitions.py b/octue/definitions.py new file mode 100644 index 000000000..d91ceb3eb --- /dev/null +++ b/octue/definitions.py @@ -0,0 +1,26 @@ +FOLDER_DEFAULTS = { + "configuration": "configuration", + "input": "input", + "tmp": "tmp", + "output": "output", +} + +VALUES_FILENAME = "values.json" + +MANIFEST_FILENAME = "manifest.json" + +STRAND_FILENAME_MAP = { + "configuration_values": VALUES_FILENAME, + "configuration_manifest": MANIFEST_FILENAME, + "input_values": VALUES_FILENAME, + "input_manifest": MANIFEST_FILENAME, + "output_values": VALUES_FILENAME, + "output_manifest": MANIFEST_FILENAME, +} + + +# TODO this should probably be defined in twined +OUTPUT_STRANDS = ("output_values", "output_manifest") + +# TODO this should probably be defined in twined +RUN_STRANDS = ("input_values", "input_manifest", "credentials", "children") diff --git a/octue/mixins/__init__.py b/octue/mixins/__init__.py index c629acd36..0f3fb513c 100644 --- a/octue/mixins/__init__.py +++ b/octue/mixins/__init__.py @@ -1,4 +1,9 @@ -from .identifiable import Identifiable # noqa: F401 -from .loggable import Loggable # noqa: F401 -from .serialisable import Serialisable # noqa: F401 -from .taggable import Taggable # noqa:F401 +from .base import MixinBase +from .identifiable import Identifiable +from .loggable import Loggable +from .pathable import Pathable +from .serialisable import Serialisable +from .taggable import Taggable + + +__all__ = "Identifiable", "Loggable", "MixinBase", "Pathable", "Serialisable", "Taggable" diff --git a/octue/mixins/base.py b/octue/mixins/base.py new file mode 100644 index 000000000..b82e78c63 --- /dev/null +++ b/octue/mixins/base.py @@ -0,0 +1,22 @@ +class MixinBase: + """ Allows you to use any combination of mixin classes which pass unused *args and **kwargs up to their superclass, + without 
encountering:
+    ```
+    Error
+    Traceback (most recent call last):
+        super().__init__(*args, **kwargs)
+    TypeError: object.__init__() takes no parameters
+    ```
+    on initialisation.
+
+    The case of passing extra args and kwargs to your class constructor which aren't popped by the various
+    mixins is still handled safely with the same error (to prevent loss of variables and poorly defined behaviour
+    later).
+    """
+
+    def __init__(self, *args, **kwargs):
+        """ Constructor for MixinBase
+        """
+        if (len(args) > 0) or (len(kwargs.keys()) > 0):
+            raise TypeError("object.__init__() takes no parameters")
+        super().__init__()
diff --git a/octue/mixins/identifiable.py b/octue/mixins/identifiable.py
index d53e3b448..94e96eb9a 100644
--- a/octue/mixins/identifiable.py
+++ b/octue/mixins/identifiable.py
@@ -56,7 +56,3 @@ def __repr__(self):
     @property
     def id(self):
         return self._id
-
-    @id.setter
-    def id(self, value):
-        raise InvalidInputException(f"You cannot set the id of an already-instantiated {self.__class__.__name__}")
diff --git a/octue/mixins/loggable.py b/octue/mixins/loggable.py
index 143ca950c..6e30b71a5 100644
--- a/octue/mixins/loggable.py
+++ b/octue/mixins/loggable.py
@@ -4,6 +4,9 @@ class Loggable:
     """ Mixin to allow instantiation of a class with a logger, or by default use the module logger from that class
 
+    The attached logger is a class variable, so all Resources of the same type inheriting from Loggable will share the
+    same logger instance; this can be confusing if you overload __init_logger__ in multiple different ways.
+
     ```
     class MyResource(Logged):
         def do_something(self):
diff --git a/octue/mixins/pathable.py b/octue/mixins/pathable.py
new file mode 100644
index 000000000..1b1839097
--- /dev/null
+++ b/octue/mixins/pathable.py
@@ -0,0 +1,94 @@
+import os
+
+from octue.exceptions import InvalidInputException
+
+
+class Pathable:
+    """ Mixin class to enable resources to get their path location from an owner.
+
+    For example, datasets can get their path from the Manifest they belong to.
+
+    """
+
+    def __init__(self, *args, path=None, path_from=None, base_from=None, **kwargs):
+        """ Constructor for pathable mixin
+        """
+        super().__init__(*args, **kwargs)
+
+        if (path_from is not None) and not isinstance(path_from, Pathable):
+            raise InvalidInputException(
+                "path_from argument must be an instance of an object inheriting from Pathable() mixin"
+            )
+
+        if (base_from is not None) and not isinstance(base_from, Pathable):
+            raise InvalidInputException(
+                "base_from argument must be an instance of an object inheriting from Pathable() mixin"
+            )
+
+        self._path_from = path_from
+        self._base_from = base_from
+        self._path_is_absolute = False
+        self.path = path
+
+    @property
+    def _base_path(self):
+        """ Gets the absolute path of the base_from object, from which any relative paths are constructed
+        :return:
+        :rtype:
+        """
+        if self._base_from:
+            return self._base_from.absolute_path
+
+        return os.getcwd()
+
+    @property
+    def _path_prefix(self):
+        """ Gets the path prefix (this is the absolute_path of the owner path_from object).
+        Defaults to the current working directory
+        """
+        if self._path_from:
+            return self._path_from.absolute_path
+
+        if self._path_is_absolute:
+            return ""
+
+        return os.getcwd()
+
+    @property
+    def absolute_path(self):
+        """ The absolute path of this resource
+        """
+        return os.path.normpath(os.path.join(self._path_prefix, self._path))
+
+    @property
+    def relative_path(self):
+        """ The path of this resource relative to its base path
+        """
+        return os.path.relpath(self.absolute_path, self._base_path)
+
+    @property
+    def path(self):
+        """ The path of this resource
+        """
+        return self._path
+
+    @path.setter
+    def path(self, value):
+        """ Set the path of this resource.
+
+        :param value: Path of the resource. If the resource was instantiated with a `path_from` object, this path must
+        be relative. Otherwise, absolute paths are acceptable.
+        :type value: Union[str, path-like]
+        """
+
+        value = os.path.normpath(value or ".")
+
+        path_is_absolute = value == os.path.abspath(value)
+
+        if path_is_absolute and self._path_from is not None:
+            raise InvalidInputException(
+                f"You cannot set an absolute path on a pathable instantiated with 'path_from'. Set a path relative to the path_from object ({self._path_from})"
+            )
+
+        self._path_is_absolute = path_is_absolute
+        self._path = value
diff --git a/octue/mixins/serialisable.py b/octue/mixins/serialisable.py
index 2c5ada272..fe8bee821 100644
--- a/octue/mixins/serialisable.py
+++ b/octue/mixins/serialisable.py
@@ -13,10 +13,11 @@ class Serialisable:
     def __init__(self, *args, **kwargs):
         """ Constructor for serialisable mixin
         """
-        # Ensure it passes construction argumnets up the chain
+        # Ensure it passes construction arguments up the chain
         super().__init__(*args, **kwargs)
 
     _serialise_fields = None
+    _exclude_serialise_fields = ("logger",)
 
     def to_file(self, file_name, **kwargs):
         """ Write to a JSON file
@@ -53,12 +54,11 @@ def __init__(self):
         """
         self.logger.debug("Serialising %s %s", self.__class__.__name__, self.id)
 
-        # Get all non-private and non-protected attributes except for 'logger'
+        # Get all non-private and non-protected attributes except those excluded specifically
         attrs_to_serialise = self._serialise_fields or (
-            k
-            for k in self.__dir__()
-            if ((k[:1] != "_") and (k != "logger") and (type(getattr(self, k, "")).__name__ != "method"))
+            k for k in self.__dir__() if ((k[:1] != "_") and (type(getattr(self, k, "")).__name__ != "method"))
         )
+        attrs_to_serialise = (attr for attr in attrs_to_serialise if attr not in self._exclude_serialise_fields)
         self_as_primitive = {attr: getattr(self, attr, None) for attr in attrs_to_serialise}
 
         # TODO this conversion backward-and-forward is very inefficient but allows us to use the same encoder for
diff --git a/octue/mixins/taggable.py b/octue/mixins/taggable.py
index 29ecf0d94..4d736bd15 100644
--- a/octue/mixins/taggable.py
+++ b/octue/mixins/taggable.py
@@ -19,6 +19,9 @@ class TagGroup:
     def __init__(self, tags):
         """ Construct a TagGroup
         """
+
+        # TODO Call the superclass with *args and **kwargs, then update everything to use MixinBase
+
         if tags is None:
             tags = []
 
@@ -53,7 +56,7 @@ def _clean(tags):
             cleaned_tag = tag.strip()
             if not re.match(TAG_PATTERN, cleaned_tag):
                 raise InvalidTagException(
-                    "Tags must contain only characters 'a-z', '0-9', ':' and '-'. They must not start with '-' or ':'."
+                    f"Invalid tag '{cleaned_tag}'. Tags must contain only characters 'a-z', '0-9', ':' and '-'. They must not start with '-' or ':'."
) cleaned_tags.append(cleaned_tag) @@ -99,10 +102,10 @@ class Taggable: """ A mixin class allowing objects to be tagged """ - def __init__(self, tags=None, **kwargs): + def __init__(self, *args, tags=None, **kwargs): """ Constructor for Taggable mixins """ - super().__init__(**kwargs) + super().__init__(*args, **kwargs) self._tags = TagGroup(tags) def add_tags(self, *args): diff --git a/octue/resources/__init__.py b/octue/resources/__init__.py index 9ac42953b..02e234214 100644 --- a/octue/resources/__init__.py +++ b/octue/resources/__init__.py @@ -1,4 +1,7 @@ -from .analysis import Analysis # noqa: F401 -from .datafile import Datafile # noqa: F401 -from .dataset import Dataset # noqa: F401 -from .manifest import Manifest # noqa: F401 +from .analysis import Analysis +from .datafile import Datafile +from .dataset import Dataset +from .manifest import Manifest + + +__all__ = "Analysis", "Datafile", "Dataset", "Manifest" diff --git a/octue/resources/analysis.py b/octue/resources/analysis.py index 56a128027..a1333318f 100644 --- a/octue/resources/analysis.py +++ b/octue/resources/analysis.py @@ -1,8 +1,12 @@ +import json import logging +from octue.definitions import OUTPUT_STRANDS from octue.exceptions import ProtectedAttributeException from octue.mixins import Identifiable, Loggable, Serialisable, Taggable from octue.resources.manifest import Manifest +from octue.utils.encoders import OctueJSONEncoder +from octue.utils.folders import get_file_name_from_strand from twined import ALL_STRANDS, Twine @@ -55,8 +59,8 @@ def __init__(self, twine, **kwargs): self.twine = twine # Pop any possible strand data sources before init superclasses (and tie them to protected attributes) - strand_kwargs = dict((name, kwargs.pop(name, None)) for name in ALL_STRANDS) - for strand_name, strand_data in strand_kwargs.items(): + strand_kwargs = ((name, kwargs.pop(name, None)) for name in ALL_STRANDS) + for strand_name, strand_data in strand_kwargs: self.__setattr__(f"_{strand_name}", strand_data) # Init superclasses @@ -76,4 +80,42 @@ def __getattr__(self, name): themselves shouldn't be changed after instantiation) """ if name in ALL_STRANDS: - return getattr(self, f"_{name}") + return getattr(self, f"_{name}", None) + + def finalise(self, output_dir=None): + """ Validates and serialises output_values and output_manifest, optionally writing them to files + + If output_dir is given, then the serialised outputs are also written to files in the output directory + + :parameter output_dir: path-like pointing to directory where the outputs should be saved to file (if None, files + are not written) + :type output_dir: path-like + + :return: dictionary of serialised strings for values and manifest data. + :rtype: dict + """ + + # Using twined's validate_strand method gives us sugar to check for both extra outputs + # (e.g. output_values where there shouldn't be any) and missing outputs (e.g. 
output_values is None when it + # should be a dict of data) + serialised = dict() + for k in OUTPUT_STRANDS: + self.logger.debug(f"Serialising {k}") + att = getattr(self, k) + if att is not None: + att = json.dumps(att, cls=OctueJSONEncoder) + + serialised[k] = att + + self.logger.debug("Validating serialised output json against twine") + self.twine.validate(**serialised) + + # Optionally write the serialised strand to disk + for k in OUTPUT_STRANDS: + if output_dir and serialised[k] is not None: + file_name = get_file_name_from_strand(k, output_dir) + self.logger.debug(f"Writing {k} to file {file_name}") + with open(file_name, "w") as fp: + fp.write(serialised[k]) + + return serialised diff --git a/octue/resources/datafile.py b/octue/resources/datafile.py index 1c49c0692..36542a501 100644 --- a/octue/resources/datafile.py +++ b/octue/resources/datafile.py @@ -4,20 +4,20 @@ import time from octue.exceptions import FileNotFoundException, InvalidInputException -from octue.mixins import Identifiable, Loggable, Serialisable, Taggable +from octue.mixins import Identifiable, Loggable, Pathable, Serialisable, Taggable from octue.utils import isfile module_logger = logging.getLogger(__name__) -class Datafile(Taggable, Serialisable, Loggable, Identifiable): +class Datafile(Taggable, Serialisable, Pathable, Loggable, Identifiable): """ Class for representing data files on the Octue system Files in a manifest look like this: { - "path": "input/datasets/7ead7669/file_1.csv", + "path": "folder/subfolder/file_1.csv", "cluster": 0, "sequence": 0, "extension": "csv", @@ -29,12 +29,15 @@ class Datafile(Taggable, Serialisable, Loggable, Identifiable): "sha-512/256": "somesha" }, - :parameter local_path_prefix: A path, specific to the present local system, to the directory containing the dataset - in which this file resides. Default is the current working directory. - :type local_path_prefix: str + :parameter path_from: The root Pathable object (typically a Dataset) that this Datafile's path is relative to. + :type path_from: Pathable - :parameter path: The path of this file, relative to the local_path_prefix (which may have a folder structure within it) - :type path: str + :parameter base_from: A Pathable object, which in most circumstances is the same as the path_from object, upon which + the `relative_path` property is based (if not given, `relative_path` is relative to current working directory + + :parameter path: The path of this file, which may include folders or subfolders, within the dataset. If no path_from + parameter is set, then absolute paths are acceptable, otherwise relative paths are required. + :type path: Union[str, path-like] :parameter logger: A logger instance to which operations with this datafile will be logged. Defaults to the module logger. 
:type logger: logging.Logger @@ -56,12 +59,15 @@ class Datafile(Taggable, Serialisable, Loggable, Identifiable): :type posix_timestamp: number """ + _exclude_serialise_fields = ("logger", "open") + def __init__( self, id=None, logger=None, - local_path_prefix=".", path=None, + path_from=None, + base_from=None, cluster=0, sequence=None, tags=None, @@ -71,7 +77,7 @@ def __init__( ): """ Construct a datafile """ - super().__init__(id=id, logger=logger, tags=tags) + super().__init__(id=id, logger=logger, tags=tags, path=path, path_from=path_from, base_from=base_from) self.cluster = cluster @@ -79,15 +85,7 @@ def __init__( self.posix_timestamp = posix_timestamp or time.time() if path is None: - raise InvalidInputException("You must supply a valid 'path' argument") - else: - path = str(path).lstrip("\\/") - - # Strip to ensure path is always expressed as relative - self.path = path - - # Replace current directory specifier in the prefix with an absolute path - self.local_path_prefix = str(os.path.abspath(local_path_prefix)) + raise InvalidInputException("You must supply a valid 'path' for a Datafile") # Set up the file extension or get it from the file path if none passed self.extension = self._get_extension_from_path() @@ -106,7 +104,7 @@ def _get_sha_256(self): """ Calculate the SHA256 hash string of the file """ sha256_hash = hashlib.sha256() - with open(self.full_path, "rb") as f: + with open(self.absolute_path, "rb") as f: # Read and update hash string value in blocks of 4K for byte_block in iter(lambda: f.read(4096), b""): sha256_hash.update(byte_block) @@ -117,17 +115,13 @@ def _get_sha_256(self): def name(self): return str(os.path.split(self.path)[-1]) - @property - def full_path(self): - return os.path.join(self.local_path_prefix, self.path) - @property def last_modified(self): - return os.path.getmtime(self.full_path) + return os.path.getmtime(self.absolute_path) @property def size_bytes(self): - return os.path.getsize(self.full_path) + return os.path.getsize(self.absolute_path) @property def sha_256(self): @@ -146,9 +140,51 @@ def check(self, size_bytes=None, sha=None, last_modified=None, extension=None): ) if not self.exists(): - raise FileNotFoundException(f"No file found at {self.full_path}") + raise FileNotFoundException(f"No file found at {self.absolute_path}") def exists(self): """ Returns true if the datafile exists on the current system, false otherwise """ - return isfile(self.full_path) + return isfile(self.absolute_path) + + @property + def open(self): + """ Context manager to handle the opening and closing of a Datafile. + + If opened in write mode, the manager will attempt to determine if the folder path exists and, if not, will + create the folder structure required to write the file. 
+
+        Use it like:
+        ```
+        my_datafile = Datafile(path='subfolder/subsubfolder/my_datafile.json')
+        with my_datafile.open('w') as fp:
+            fp.write("{}")
+        ```
+        This is equivalent to the standard python:
+        ```
+        my_datafile = Datafile(path='subfolder/subsubfolder/my_datafile.json')
+        os.makedirs(os.path.split(my_datafile.absolute_path)[0], exist_ok=True)
+        with open(my_datafile.absolute_path, 'w') as fp:
+            fp.write("{}")
+        ```
+        """
+
+        absolute_path = self.absolute_path
+
+        class DataFileContextManager:
+            def __init__(obj, mode="r", **kwargs):
+                obj.mode = mode
+                obj.kwargs = kwargs
+                obj.absolute_path = absolute_path
+                if "w" in obj.mode:
+                    os.makedirs(os.path.split(obj.absolute_path)[0], exist_ok=True)
+
+            def __enter__(obj):
+                obj.fp = open(obj.absolute_path, obj.mode, **obj.kwargs)
+                return obj.fp
+
+            def __exit__(obj, *args):
+                if obj.fp is not None:
+                    obj.fp.close()
+
+        return DataFileContextManager
diff --git a/octue/resources/dataset.py b/octue/resources/dataset.py
index b21cdfadd..f97f82649 100644
--- a/octue/resources/dataset.py
+++ b/octue/resources/dataset.py
@@ -1,31 +1,38 @@
 import logging
-import os
 
 from octue.exceptions import BrokenSequenceException, InvalidInputException, UnexpectedNumberOfResultsException
-from octue.mixins import Identifiable, Loggable, Serialisable, Taggable
+from octue.mixins import Identifiable, Loggable, Pathable, Serialisable, Taggable
 from octue.resources.datafile import Datafile
-from octue.utils import isfolder
 
 
 module_logger = logging.getLogger(__name__)
 
 
-class Dataset(Taggable, Serialisable, Loggable, Identifiable):
+class Dataset(Taggable, Serialisable, Pathable, Loggable, Identifiable):
     """ A representation of a dataset, containing files, tags, etc
 
     This is used to read a list of files (and their associated properties) into octue analysis, or to compile a
     list of output files (results) and their properties that will be sent back to the octue system.
 
     """
 
-    def __init__(self, id=None, logger=None, tags=None, **kwargs):
+    def __init__(self, id=None, logger=None, path=None, path_from=None, base_from=None, tags=None, **kwargs):
         """ Construct a Dataset
         """
-        super().__init__(id=id, logger=logger, tags=tags)
-        self.files = kwargs.pop("files", list())
-        self.__dict__.update(**kwargs)
+        super().__init__(id=id, logger=logger, tags=tags, path=path, path_from=path_from, base_from=base_from)
+
+        # TODO The decoders aren't being used; utils.decoders.OctueJSONDecoder should be used in twined
+        # so that resources get automatically instantiated.
+        # Add a proper `decoder` argument to the load_json utility in twined so that datasets, datafiles and manifests
+        # get initialised properly, then remove this hackjob.
+        files = kwargs.pop("files", list())
+        self.files = []
+        for fi in files:
+            if isinstance(fi, Datafile):
+                self.files.append(fi)
+            else:
+                self.files.append(Datafile(**fi, path_from=self, base_from=self))
 
-        # TODO A much better way than relying on the current directory!
- self._path = os.path.abspath(f"./dataset-{self.id}") + self.__dict__.update(**kwargs) def append(self, *args, **kwargs): """ Add a data/results file to the manifest @@ -37,7 +44,6 @@ def append(self, *args, **kwargs): # or more simply my_manifest.append(**{...}) which implicitly creates the datafile from the starred list of input arguments - TODO allow for appending a list of datafiles """ if len(args) > 1: # Recurse to allow addition of many files at once @@ -87,27 +93,26 @@ def get_files(self, field_lookup, files=None, filter_value=None): # Frequent error of typing only a single underscore causes no results to be returned... catch it if "__" not in field_lookup: - raise InvalidInputException("Field lookups should be in the form '__'") + raise InvalidInputException( + f"Invalid field lookup '{field_lookup}'. Field lookups should be in the form '__'" + ) + + field_lookups = { + "name__icontains": lambda filter_value, file: filter_value.lower() in file.name.lower(), + "name__contains": lambda filter_value, file: filter_value in file.name, + "name__endswith": lambda filter_value, file: file.name.endswith(filter_value), + "name__startswith": lambda filter_value, file: file.name.startswith(filter_value), + "tag__exact": lambda filter_value, file: filter_value in file.tags, + "tag__startswith": lambda filter_value, file: file.tags.startswith(filter_value), + "tag__endswith": lambda filter_value, file: file.tags.endswith(filter_value), + "tag__contains": lambda filter_value, file: file.tags.contains(filter_value), + "sequence__notnone": lambda filter_value, file: file.sequence is not None, + } results = [] + for file in files: - if field_lookup == "name__icontains" and filter_value.lower() in file.name.lower(): - results.append(file) - if field_lookup == "name__contains" and filter_value in file.name: - results.append(file) - if field_lookup == "name__endswith" and file.name.endswith(filter_value): - results.append(file) - if field_lookup == "name__startswith" and file.name.startswith(filter_value): - results.append(file) - if field_lookup == "tag__exact" and filter_value in file.tags: - results.append(file) - if field_lookup == "tag__startswith" and file.tags.startswith(filter_value): - results.append(file) - if field_lookup == "tag__endswith" and file.tags.endswith(filter_value): - results.append(file) - if field_lookup == "tag__contains" and file.tags.contains(filter_value): - results.append(file) - if field_lookup == "sequence__notnone" and file.sequence is not None: + if field_lookups[field_lookup](filter_value, file): results.append(file) return results @@ -159,9 +164,3 @@ def get_file_by_tag(self, tag_string): raise UnexpectedNumberOfResultsException("No files found with this tag") return results[0] - - @property - def path(self): - # Lazily make the folder if absent - isfolder(self._path, make_if_absent=True) - return self._path diff --git a/octue/resources/manifest.py b/octue/resources/manifest.py index bc9ae5ab7..344f9b4fe 100644 --- a/octue/resources/manifest.py +++ b/octue/resources/manifest.py @@ -1,31 +1,61 @@ import logging -from octue.exceptions import InvalidInputException -from octue.mixins import Identifiable, Loggable, Serialisable +from octue.exceptions import InvalidInputException, InvalidManifestException +from octue.mixins import Identifiable, Loggable, Pathable, Serialisable from .dataset import Dataset module_logger = logging.getLogger(__name__) -class Manifest(Serialisable, Loggable, Identifiable): +class Manifest(Pathable, Serialisable, Loggable, Identifiable): """ 
A representation of a manifest, which can contain multiple datasets This is used to manage all files coming into (or leaving), a data service for an analysis at the configuration, input or output stage. """ - def __init__(self, id=None, logger=None, **kwargs): + def __init__(self, id=None, logger=None, path=None, path_from=None, base_from=None, **kwargs): """ Construct a Manifest """ - super().__init__(id=id, logger=logger) - self.datasets = kwargs.pop("datasets", list()) + super().__init__(id=id, logger=logger, path=path, path_from=path_from, base_from=base_from) + + # TODO The decoders aren't being used; utils.decoders.OctueJSONDecoder should be used in twined + # so that resources get automatically instantiated. + # Add a proper `decoder` argument to the load_json utility in twined so that datasets, datafiles and manifests + # get initialised properly, then tidy up this hackjob. Also need to allow Pathables to update ownership + # (because decoders work from the bottom of the tree upwards, not top-down) + + datasets = kwargs.pop("datasets", list()) self.keys = kwargs.pop("keys", dict()) - self.__dict__.update(**kwargs) - # TODO we need to add keys to the manifest file schema so that we know what dataset(s) map to what keys + # TODO we need to add keys to the manifest file schema in twined so that we know what dataset(s) map to what keys + # In the meantime, we enforce at this level that keys will match + n_keys = len(self.keys.keys()) + n_datasets = len(datasets) + if n_keys != n_datasets: + raise InvalidManifestException( + f"Manifest instantiated with {n_keys} keys, and {n_datasets} datasets... keys must match datasets!" + ) + + # Sort the keys by the dataset index so we have a list of keys in the same order as the dataset list. + # We'll use this to name the dataset folders + key_list = [k for k, v in sorted(self.keys.items(), key=lambda item: item[1])] + + # Instantiate the datasets if not already done + self.datasets = [] + for key, ds in zip(key_list, datasets): + if isinstance(ds, Dataset): + self.datasets.append(ds) + else: + self.datasets.append(Dataset(**ds, path=key, path_from=self)) + + # Instantiate the rest of everything! + self.__dict__.update(**kwargs) def get_dataset(self, key): """ Gets a dataset by its key name (as defined in the twine) + :return: Dataset selected by its key + :rtype: Dataset """ idx = self.keys.get(key, None) if idx is None: @@ -46,6 +76,6 @@ def prepare(self, data): self.keys[dataset_spec["key"]] = idx # TODO generate a unique name based on the filter key, tag datasets so that the tag filters in the spec # apply automatically and generate a description of the dataset - self.datasets.append(Dataset(logger=self.logger)) + self.datasets.append(Dataset(logger=self.logger, path_from=self, path=dataset_spec["key"])) return self diff --git a/octue/runner.py b/octue/runner.py index 96f438fd8..add71e02a 100644 --- a/octue/runner.py +++ b/octue/runner.py @@ -1,6 +1,8 @@ +import importlib import logging +import os +import sys -from octue.cli import AppFrom from octue.resources.analysis import CLASS_MAP, Analysis from octue.utils import gen_uuid from twined import Twine @@ -40,23 +42,31 @@ class Runner: def __init__( self, twine="twine.json", configuration_values=None, configuration_manifest=None, log_level=logging.INFO ): - """ Constructor for the Runner class - """ + """ Constructor for the Runner class. 
""" # Ensure the twine is present and instantiate it self.twine = Twine(source=twine) + if "configuration_values" not in self.twine.available_strands: + configuration_values = None + + if "configuration_manifest" not in self.twine.available_strands: + configuration_manifest = None + # Validate and initialise configuration data self.configuration = self.twine.validate( configuration_values=configuration_values, configuration_manifest=configuration_manifest, cls=CLASS_MAP, ) + # Set path for configuration manifest. + # TODO this is hacky, we need to rearchitect the twined validation so we can do this kind of thing in there + self.configuration["configuration_manifest"] = self._update_manifest_path( + self.configuration.get("configuration_manifest", None), configuration_manifest, + ) + # Store the log level (same log level used for all analyses) self._log_level = log_level - # Store analyses. Multiple analysis objects can be created and coexist. - self.analyses = {} - def _get_default_handler(self): """ Gets a basic console handler set up for logging analyses """ @@ -85,13 +95,42 @@ def _get_analysis_logger(self, analysis_id, handler=None): return analysis_logger - def run(self, app_src, handler=None, input_values=None, input_manifest=None, credentials=None, children=None): + @staticmethod + def _update_manifest_path(manifest, pathname): + """ A Quick hack to stitch the new Pathable functionality in the 0.1.4 release into the CLI and runner. + + The way we define a manifest path can be more robustly implemented as we migrate functionality into the twined + library + + :param manifest: + :type manifest: + :param pathname: + :type pathname: + :return: + :rtype: + """ + if manifest is not None and hasattr(pathname, "endswith"): + if pathname.endswith(".json"): + manifest.path = os.path.split(pathname)[0] + + # Otherwise do nothing and rely on manifest having its path variable set already + return manifest + + def run( + self, + app_src, + handler=None, + input_values=None, + input_manifest=None, + credentials=None, + children=None, + output_manifest_path=None, + ): """ Run an analysis :parameter app_src: Either: an instance of the AppFrom manager class which has a run() method, or - a function which accepts a single parameter (the instantiated analysis), or a string or path_like pointing - a string or path_like pointing to an application folder (which should contain an 'app.py' function like the - templates). This typically points to the run() function defined in the 'app.py' file. + a function which accepts a single parameter (the instantiated analysis), or a string pointing + to an application folder (which should contain an 'app.py' function like the templates). :type app_src: Union[AppFrom, function, str] :parameter input_values: The input_values strand data. Can be expressed as a string path of a *.json file @@ -114,6 +153,9 @@ def run(self, app_src, handler=None, input_values=None, input_manifest=None, cre already-parsed dict. :type children: Union[str, dict] + :parameter output_manifest_path: Path where output data will be written + :type output_manifest_path: Union[str, path-like] + :parameter handler: the logging.Handler instance which will be used to handle logs for this analysis run. handlers can be created as per the logging cookbook https://docs.python.org/3/howto/logging-cookbook.html but should use the format defined above in LOG_FORMAT. 
@@ -121,6 +163,8 @@ def run(self, app_src, handler=None, input_values=None, input_manifest=None, cre :return: None """ + if "input_manifest" not in self.twine.available_strands: + input_manifest = None inputs = self.twine.validate( input_values=input_values, @@ -132,8 +176,16 @@ def run(self, app_src, handler=None, input_values=None, input_manifest=None, cre allow_extra=False, ) + # TODO this is hacky, we need to rearchitect the twined validation so we can do this kind of thing in there + inputs["input_manifest"] = self._update_manifest_path(inputs.get("input_manifest", None), input_manifest,) + outputs_and_monitors = self.twine.prepare("monitors", "output_values", "output_manifest", cls=CLASS_MAP) + # TODO this is hacky, we need to rearchitect the twined validation so we can do this kind of thing in there + outputs_and_monitors["output_manifest"] = self._update_manifest_path( + outputs_and_monitors.get("output_manifest", None), output_manifest_path, + ) + analysis_id = gen_uuid() analysis_logger = self._get_analysis_logger(analysis_id, handler) analysis = Analysis( @@ -154,11 +206,69 @@ def run(self, app_src, handler=None, input_values=None, input_manifest=None, cre else: app_src(analysis) - self.twine.validate(output_values=analysis.output_values) - self.twine.validate(output_manifest=analysis.output_manifest) - except Exception as e: analysis_logger.error(str(e)) raise e return analysis + + +def unwrap(fcn): + """ Recurse through wrapping to get the raw function without decorators. + """ + if hasattr(fcn, "__wrapped__"): + return unwrap(fcn.__wrapped__) + return fcn + + +class AppFrom: + """ Context manager that imports module 'app' from user's code base at a location app_path. + + The manager will issue a warning if an existing module called "app" is already loaded. + + The manager makes a temporary addition to the system path (to ensure app is loaded from the correct path) + + The manager will unload the module (by deleting it from sys.modules) on exit, enabling + + with AppFrom('/path/to/dir') as app: + Runner().run(app) + + """ + + def __init__(self, app_path="."): + self.app_path = os.path.abspath(os.path.normpath(app_path)) + module_logger.debug(f"Initialising AppFrom context at app_path {self.app_path}") + self.app_module = None + + def __enter__(self): + # Warn on an app present on the system path + if "app" in sys.modules.keys(): + module_logger.warning( + "Module 'app' already on system path. Using 'AppFrom' context will yield unexpected results. 
Avoid using 'app' as a python module, except for your main entrypoint" + ) + + # Insert the present directory first on the system path + sys.path.insert(0, self.app_path) + + # Import the app from the present directory + self.app_module = importlib.import_module("app") + + # Immediately clean up the entry to the system path (don't use "remove" because if the user has it in their + # path, this'll be an unexpected side effect, and don't do it in cleanup in case the called code inserts a path) + sys.path.pop(0) + module_logger.debug( + f"Imported app at app_path and cleaned up temporary modification to sys.path {self.app_path}" + ) + + return self + + def __exit__(self, exc_type, exc_value, traceback): + # Unload the imported module + del sys.modules["app"] + module_logger.debug(f"Deleted app from sys.modules and cleaned up (app_path {self.app_path})") + + @property + def run(self): + """ Returns the unwrapped run function from app.py in the application's root directory + """ + return unwrap(self.app_module.run) diff --git a/octue/templates/__init__.py b/octue/templates/__init__.py index 6244b0a20..18b3d12ca 100644 --- a/octue/templates/__init__.py +++ b/octue/templates/__init__.py @@ -1 +1,4 @@ -from .templates import copy_template # noqa: F401 +from .templates import copy_template + + +__all__ = ("copy_template",) diff --git a/octue/templates/template-python-fractal/app.py b/octue/templates/template-python-fractal/app.py index b7271acb3..4de08acb6 100644 --- a/octue/templates/template-python-fractal/app.py +++ b/octue/templates/template-python-fractal/app.py @@ -1,17 +1,15 @@ -import os -import sys - from fractal import fractal -from octue import octue_cli, octue_run, octue_version -@octue_run def run(analysis): """ Your main entrypoint to run the application This is the function that gets run each time somebody requests an analysis from the digital twin / data service. You should write your own code and call it from here. + It needs to be called 'run' and the file must be called 'app.py'; Octue will handle the rest, supplying + you with an "analysis" object with validated inputs for you to process. + ## The Analysis: `analysis` is an instantiated Analysis class object, which you can import here (as shown) or anywhere else in your @@ -39,7 +37,7 @@ def run(analysis): # analysis.logger.info(f"The output directory is {analysis.output_dir}") # analysis.logger.info(f"The tmp directory, where you can store temporary files or caches, is {analysis.tmp_dir}") - # Print statements will get logged (stdout and stderr are mirrored to the log files so you don't miss anything)... + # Print statements will get logged... print("Hello! The app is running!") # noqa: T001 # ... but we encourage you to use the attached logger, which handles sending logs to remote services and allows them @@ -53,27 +51,3 @@ def run(analysis): # Run the code fractal(analysis) - - -@octue_version -def version(): - """ Returns the version number of the application - """ - - # Top Tip: - # For all Octue internal apps, we simply return the git revision of the code. - # Every single commit creates a new version, we can always check out the exact version of the code that ran, and we - # can quickly look up the version state and history on github when we have to debug an app. Sweet! - version_no = os.system("git rev-parse HEAD") - - # Return the version number as a string - return version_no - - -# If running from an IDE or test console, it'll run this file rather than calling the application from the CLI... 
-# In that case we pass arguments through the CLI just as if it were called from the command line. -if __name__ == "__main__": - - # Invoke the CLI to process the arguments, set up an analysis and run it - args = sys.argv[1:] if len(sys.argv) > 1 else [] - octue_cli(args) diff --git a/octue/templates/template-python-fractal/fractal/__init__.py b/octue/templates/template-python-fractal/fractal/__init__.py index 4ac9f01fa..e11062189 100644 --- a/octue/templates/template-python-fractal/fractal/__init__.py +++ b/octue/templates/template-python-fractal/fractal/__init__.py @@ -1 +1,4 @@ -from .fractal import fractal # noqa: F401 +from .fractal import fractal + + +__all__ = ("fractal",) diff --git a/octue/templates/template-python-fractal/fractal/fractal.py b/octue/templates/template-python-fractal/fractal/fractal.py index 17a0b0f4a..6ee6e800a 100644 --- a/octue/templates/template-python-fractal/fractal/fractal.py +++ b/octue/templates/template-python-fractal/fractal/fractal.py @@ -68,8 +68,7 @@ def fractal(analysis): ) # Actually write the contents to the file specified by the Datafile - # TODO we should write a helper and/or context manager for this, like `df.write_json({})` or `with df('w') as fp:` - with open(df.full_path, "w") as fp: + with open(df.absolute_path, "w") as fp: json.dump( {"data": data, "layout": layout}, fp, cls=OctueJSONEncoder ) # The special encoder just makes it easy to handle numpy arrays diff --git a/octue/templates/template-python-fractal/requirements.txt b/octue/templates/template-python-fractal/requirements.txt index 94c07e767..005948e75 100644 --- a/octue/templates/template-python-fractal/requirements.txt +++ b/octue/templates/template-python-fractal/requirements.txt @@ -1,16 +1,4 @@ -appdirs==1.4.3 -args==0.1.0 -clint==0.5.1 -packaging==20.0 -pyparsing==2.4.6 -six==1.13.0 -octue==0.1.0 -twined==0.0.4 -click==7.0 - -# ----------- Install the current app (as editable, so you can develop on it) ------------------------------------------ - ---editable . +octue==0.1.4 # ----------- Some common libraries ----------------------------------------------------------------------------------- @@ -22,16 +10,11 @@ click==7.0 #plotly==3.6.1 # A numerical manipulation library -numpy==1.16.2 +numpy==1.19.2 -# A powerful database api library. Supply it with your db's uri (through environment variables - don't commit URIs to git!!!!) and read/add data to/from databases. -# Note that results of analyses using externally managed databases as data sources cannot be guaranteed to be idempotent. +# A powerful database api library. Supply it with your db's uri (through environment variables - don't commit URIs +# to git!!!!) and read/add data to/from databases. +# Note: Results from apps that use externally managed data sources cannot be guaranteed to be idempotent (reproducible), +# because the data can change between runs. #SQLAlchemy==1.0.12 #SQLAlchemy-Utils==0.31.6 - - -# ----------- Twined / SDK developer's reference only. Do not uncomment. 
--------------------------------------------------------------- - -# To develop twined or the SDK within the same environment as an app, install them as editable: -# --editable ../twined -# --editable ../octue-sdk-python diff --git a/octue/templates/template-python-fractal/setup.py b/octue/templates/template-python-fractal/setup.py index 2c7c2a6b5..a212774b2 100644 --- a/octue/templates/template-python-fractal/setup.py +++ b/octue/templates/template-python-fractal/setup.py @@ -6,12 +6,8 @@ def git_version(): return os.system("git rev-parse HEAD") +# This file makes your module installable as a library. It's not essential for running apps with twined. + setup( - name="template-python-fractal", # Change this if you want to make your module installable as a library - version=git_version(), - py_modules=["app"], - entry_points=""" - [console_scripts] - octue-app=app:octue_app - """, + name="template-python-fractal", version=git_version(), py_modules=["app"], ) diff --git a/octue/templates/template-using-manifests/.gitignore b/octue/templates/template-using-manifests/.gitignore new file mode 100644 index 000000000..62a0644fd --- /dev/null +++ b/octue/templates/template-using-manifests/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# Jetbrains project structures +.idea/ \ No newline at end of file diff --git a/octue/templates/template-using-manifests/app.py b/octue/templates/template-using-manifests/app.py new file mode 100644 index 000000000..ca23663fd --- /dev/null +++ b/octue/templates/template-using-manifests/app.py @@ -0,0 +1,100 @@ +from cleaner import clean, read_csv_files, read_dat_file + +from octue.resources import Datafile + + +def run(analysis): + """ An app to read a time series of files from a dataset, clean them and write a new, cleaned, dataset. + + See the "fractal" template for an introduction to the analysis object and the purpose of this 'run' function. + + Here, let's create an example application designed to clean up CSV data files produced by an instrument, in this + case a meteorological mast. 
+
+    The aim of this example is to teach you how to use input and output file manifests in an app - so what we'll do is:
+    - Use the input manifest to read a sequence of files
+    - Perform a simple transformation on some of the data (as if we were doing a data cleaning process)
+    - Create new files containing the cleaned data
+    - Add them to the output manifest
+
+    """
+
+    # You can use the attached logger to record debug statements, general information, warnings or errors
+    analysis.logger.info(f"Starting clean up of files in {analysis.input_dir}")
+
+    # Get the configuration value for our time averaging window (or if not present, use the default specified in
+    # the twine)
+    time_window = analysis.configuration_values.get("time_window", 600)
+    analysis.logger.info(f"Averaging window set to {time_window}s")
+
+    # Get the input dataset which will be read in
+    input_dataset = analysis.input_manifest.get_dataset("raw_met_mast_data")
+
+    # There are two types of files in the dataset. Metadata file(s), saved daily, and measurement files (saved hourly).
+    # Because a manifest has been created, we're able to get this data out easily with the dataset filtering
+    # capabilities. Let's get the metadata and the timeseries files, whilst showing off a couple of the filters.
+    #
+    # See the Dataset class help for more.
+    metadata_file = input_dataset.get_file_by_tag("meta")
+    timeseries_files = input_dataset.get_file_sequence("tag__exact", filter_value="timeseries")
+    #
+    # We used these because they're special helpers - in this case ensuring that there's only one metadata file and
+    # ensuring that the timeseries files come in a strictly ordered sequence.
+    #
+    # We could also have picked up one or more files using general filters, like so:
+    # metadata_files = input_dataset.get_files("name__icontains", filter_value="meta")
+    #
+    # There are generally a few ways to do it. Choose one which is likely to be most consistent - for example, if your
+    # filenames might be subject to change, but you have better control over the tags, rely on those.
+
+    # At this point it's over to you, to do whatever you want with the contents of these files.
+    # For this example app, we will:
+    #
+    # Use a custom function to read in the strange metadata file that came with the dataset
+    metadata = read_dat_file(metadata_file)
+    #
+    # Read the sequence of CSV files and concatenate into a pandas dataframe (like a table)
+    data = read_csv_files(timeseries_files)
+    #
+    # Clean the timeseries data up
+    data = clean(data, metadata["date"])
+
+    # The twine specifies an output dataset, so it's already been created for you (although right now it's empty, of
+    # course, because we haven't done the processing yet)...
+    output_dataset = analysis.output_manifest.get_dataset("cleaned_met_mast_data")
+
+    # We'll add tags to the output dataset, which will help to improve searchability and allow
+    # other apps, reports, users and analyses to automatically find figures and
+    # use them.
+    #
+    # Get descriptive with tags... they are whitespace-delimited and colons can be
+    # used to provide subtags. Tags are case insensitive, and accept a-z, 0-9,
+    # hyphens and underscores (which can be used literally in search and are also
+    # used to separate words in natural language search). Other special characters
+    # will be stripped.
+    output_dataset.tags = "met mast cleaned"
+
+    # Create a Datafile to hold the concatenated, cleaned output data. We could put it in the current directory
+    # (by leaving path_from unspecified) but it makes sense to put it in a folder specific to this output
+    # dataset - doing so avoids any race conditions arising (if other instances of this application are running at the
+    # same time), and avoids storage leaks, because files get cleaned up correctly.
+    timeseries_datafile = Datafile(
+        path="cleaned.csv",
+        path_from=output_dataset,  # Tells it where it should be stored, in this case the output dataset folder
+        skip_checks=True,  # We haven't created the actual file yet, so checks would definitely fail!
+        tags="timeseries",
+    )
+
+    # Write the file (now we know where to write it)
+    with timeseries_datafile.open("w") as fp:
+        data.to_csv(path_or_buf=fp)
+
+    # And finally we add it to the output
+    output_dataset.append(timeseries_datafile)
+
+    # We're done! There's only one datafile in the output dataset, but you could create thousands more and append them
+    # all :)
+    #
+    # If you're running this on your local machine, that's it - but when this code runs as an analysis in the cloud,
+    # the files in the output manifest are copied into the cloud store. Their names and tags are registered in a search
+    # index so your colleagues can find the dataset you've produced.
diff --git a/octue/templates/template-using-manifests/cleaner/__init__.py b/octue/templates/template-using-manifests/cleaner/__init__.py
new file mode 100644
index 000000000..09168d30f
--- /dev/null
+++ b/octue/templates/template-using-manifests/cleaner/__init__.py
@@ -0,0 +1,6 @@
+from .clean import clean
+from .read_csv_files import read_csv_files
+from .read_dat_file import read_dat_file
+
+
+__all__ = ("read_dat_file", "read_csv_files", "clean")
diff --git a/octue/templates/template-using-manifests/cleaner/clean.py b/octue/templates/template-using-manifests/cleaner/clean.py
new file mode 100644
index 000000000..a80ff4e81
--- /dev/null
+++ b/octue/templates/template-using-manifests/cleaner/clean.py
@@ -0,0 +1,52 @@
+import re
+from datetime import datetime, timezone
+from stringcase import snakecase
+
+
+def clean(data, date):
+    """ Clean up data from meteorological mast anemometers and wind vanes...
+
+    ... or rename the function and do pretty much whatever you want with the data!
+
+    In this ultra-simple example, we'll parse the time and add the date to it (timestamps are much more useful when
+    complete!), as well as correct a typo in the column headings
+
+    :parameter data: Pandas dataframe containing imported, uncleaned data
+    :type data: pandas.dataframe
+
+    :return: The same dataframe, cleaned.
+    :rtype: pandas.dataframe
+    """
+
+    # Add a proper datestamp column (combining the time from the time column with the date from the metadata)
+    cleaned_timestamps = []
+    for ts in data["TimeStamp"]:
+        # Decimal minutes. REALLY? This is why our work is so hard. Learn from the gross lack of foresight of whoever
+        # made this meteorological mast, and put data into sensible formats (ISO formatted timestamp would've been
+        # really helpful here!).
+        hms = [int(value) for value in re.split(r"[:.]", ts)]
+        hms[2] = hms[2] * 6
+        cleaned_timestamps.append(
+            datetime(
+                year=date.year,
+                month=date.month,
+                day=date.day,
+                hour=hms[0],
+                minute=hms[1],
+                second=hms[2],
+                tzinfo=timezone.utc,  # UTC is a best guess since no timezone info supplied with the data.
+            )
+        )
+
+    # Replace the near-useless decimal-minutes-no-date-no-timezone with a cleaned, complete datestamp
+    data.pop("TimeStamp")
+    data["time_stamp"] = cleaned_timestamps  # If storage space is at a premium, convert this to posix time
+
+    # And clean up typos made by an overly stressed out junior engineer who is too busy to care about the pain this
+    # has caused you finding it
+    data["Barometer_1"] = data.pop("Barmoeter_1")
+
+    # And convert column names to snake_case, so they make sensible variable names without spaces etc
+    data = data.rename(columns=lambda old_name: snakecase(old_name))
+
+    return data
diff --git a/octue/templates/template-using-manifests/cleaner/read_csv_files.py b/octue/templates/template-using-manifests/cleaner/read_csv_files.py
new file mode 100644
index 000000000..982edcfe7
--- /dev/null
+++ b/octue/templates/template-using-manifests/cleaner/read_csv_files.py
@@ -0,0 +1,23 @@
+import pandas
+
+
+def read_csv_files(files):
+    """ Read a sequence of CSV files containing meteorological mast anemometer and wind vane data
+
+    You don't really need to care about this, because your files are unlikely to be in the same form as our
+    example csv files. But for the sake of a complete example, we show you how we'd read these in here.
+
+    :parameter files: List of the file names to read in and concatenate
+    :type files: list(octue.Datafile)
+
+    :return: Pandas dataframe containing the imported, uncleaned data
+    :rtype: pandas.dataframe
+    """
+
+    # This is a simple concatenation. If you have a huge dataset, it's worth getting into working with remote files on
+    # the cloud and/or doing this in batches.
+    frames = []
+    for file in files:
+        frames.append(pandas.read_csv(file.absolute_path))
+
+    return pandas.concat(frames)
diff --git a/octue/templates/template-using-manifests/cleaner/read_dat_file.py b/octue/templates/template-using-manifests/cleaner/read_dat_file.py
new file mode 100644
index 000000000..df5b4b168
--- /dev/null
+++ b/octue/templates/template-using-manifests/cleaner/read_dat_file.py
@@ -0,0 +1,41 @@
+from dateparser import parse
+
+
+def read_dat_file(file):
+    """ Read a dat file containing meteorological mast metadata
+
+    You don't really need to care about this, because your files are unlikely to be in the same form as our
+    example dat files. You'll need to build your own file readers and test them (or use open-source libraries to do
+    the reading).
+
+    So this is only here to make the example work. Although you could use this as a case study of how annoying it is to
+    parse custom formatted data files - even "simple" ones!
+
+    :parameter file: File from the dataset that contains metadata
+    :type file: octue.Datafile
+
+    :return: Dictionary of available metadata
+    :rtype: dict
+    """
+
+    # Read the file
+    with open(file.absolute_path, "r") as text_file:
+        lines = text_file.readlines()
+
+    # Parse the metadata
+    metadata = dict()
+    for line in lines:
+        # In the proprietary example dat file, a colon separates each parameter from its value, so if a line contains
+        # a colon, import that parameter-value pair
+        if ":" in line:
+            param = line.split(":", 1)
+            key = param[0].strip().lower()
+            value = param[1].strip()
+            value = parse(value) if key == "date" else float(value)
+            metadata[key] = value
+
+    # Make sure that a `date` field is in the metadata, as we'll need it later
+    if "date" not in metadata:
+        raise ValueError("No DATE field in the metadata file (required)")
+
+    return metadata
diff --git a/octue/templates/template-using-manifests/data/configuration/values.json b/octue/templates/template-using-manifests/data/configuration/values.json
new file mode 100644
index 000000000..60e2195a9
--- /dev/null
+++ b/octue/templates/template-using-manifests/data/configuration/values.json
@@ -0,0 +1,3 @@
+{
+    "window_size": 600
+}
diff --git a/octue/templates/template-using-manifests/data/input/manifest.json b/octue/templates/template-using-manifests/data/input/manifest.json
new file mode 100644
index 000000000..217307085
--- /dev/null
+++ b/octue/templates/template-using-manifests/data/input/manifest.json
@@ -0,0 +1,54 @@
+{
+  "id": "8ead7669-8162-4f64-8cd5-4abe92509e17",
+  "keys": {
+    "raw_met_mast_data": 0
+  },
+  "datasets": [
+    {
+      "id": "7ead4669-8162-4f64-8cd5-4abe92509e17",
+      "name": "meteorological mast dataset",
+      "tags": "met mast wind location:108346",
+      "files": [
+        {
+          "path": "08DEC/High Res Meteorological Mast Data - 8 Dec_1.csv",
+          "cluster": 0,
+          "sequence": 0,
+          "extension": "csv",
+          "tags": "timeseries",
+          "posix_timestamp": 1605783547.0,
+          "id": "acff07bc-7c19-4ed5-be6d-a6546eae8e86",
+          "last_modified": "2019-12-08T01:00:00.533005Z",
+          "name": "High Res Meteorological Mast Data - 8 Dec_1.csv",
+          "size_bytes": 4443,
+          "sha-512/256": "somesha"
+        },
+        {
+          "path": "08DEC/High Res Meteorological Mast Data - 8 Dec_2.csv",
+          "cluster": 0,
+          "sequence": 1,
+          "extension": "csv",
+          "tags": "timeseries",
+          "posix_timestamp": 1605783547.0,
+          "id": "bdff07bc-7c19-4ed5-be6d-a6546eae8e45",
+          "last_modified": "2019-12-08T02:00:00.433041Z",
+          "name": "High Res Meteorological Mast Data - 8 Dec_2.csv",
+          "size_bytes": 4471,
+          "sha-512/256": "someothersha"
+        },
+        {
+          "path": "08DEC/meta - 8 Dec.dat",
+          "cluster": 1,
+          "sequence": 0,
+          "extension": "dat",
+          "tags": "meta",
+          "posix_timestamp": 1605783547.0,
+          "id": "ceff07bc-7c19-4ed5-be6d-a6546eae8e86",
+          "last_modified": "2019-12-08T00:00:00.213436Z",
+          "name": "meta - 8 Dec.dat",
+          "size_bytes": 4443,
+          "sha-512/256": "somesha"
+        }
+      ]
+    }
+  ]
+}
diff --git a/octue/templates/template-using-manifests/data/input/raw_met_mast_data/08DEC/High Res Meteorological Mast Data - 8 Dec_1.csv b/octue/templates/template-using-manifests/data/input/raw_met_mast_data/08DEC/High Res Meteorological Mast Data - 8 Dec_1.csv
new file mode 100644
index 000000000..6ac275b3f
--- /dev/null
+++ b/octue/templates/template-using-manifests/data/input/raw_met_mast_data/08DEC/High Res Meteorological Mast Data - 8 Dec_1.csv
@@ -0,0 +1,60 @@
+TimeStamp,Barometer_2,Barmoeter_1,temp_2,temp_1,WindVane_2,WindVane_1,Anemo_2,Anemo_1
+00:00.4,1040.983,1022.315,5.88366,6.89383,135.9531,275.6715,5.7752,5.8231
+00:01.4,1040.971,1022.297,5.88061,6.88772,135.9531,275.3199,5.8212,5.8231 +00:02.4,1040.983,1022.297,5.88366,6.89993,135.9531,275.3199,5.8672,5.8231 +00:03.4,1040.971,1022.297,5.88061,6.88467,135.9531,275.3199,5.7752,5.869 +00:04.5,1040.983,1022.297,5.88061,6.86941,136.3047,275.3199,5.8212,5.8231 +00:05.5,1040.971,1022.279,5.87146,6.87857,136.3047,275.6715,5.8672,5.9609 +00:06.5,1040.983,1022.279,5.88977,6.86941,136.6562,275.6715,5.9132,5.915 +00:07.5,1040.971,1022.279,5.87756,6.87246,135.6016,274.9684,5.8672,5.915 +00:08.5,1040.983,1022.315,5.88061,6.86331,135.9531,275.3199,5.9132,5.9609 +00:09.6,1040.971,1022.297,5.87756,6.87246,135.6016,274.9684,5.9592,6.0068 +00:10.6,1040.971,1022.297,5.88061,6.87246,135.6016,274.6168,5.9592,6.0068 +00:11.6,1040.971,1022.297,5.88061,6.86331,135.6016,274.9684,5.9592,6.0068 +00:12.6,1040.971,1022.297,5.88061,6.87857,135.25,274.2652,5.9132,5.869 +00:13.7,1040.983,1022.297,5.88672,6.86636,134.8984,274.2652,5.9592,5.9609 +00:14.7,1040.983,1022.297,5.88061,6.86026,134.8984,273.9137,5.8672,6.0068 +00:15.7,1040.971,1022.297,5.87146,6.8572,134.8984,273.9137,5.8672,6.0527 +00:16.7,1040.971,1022.315,5.87146,6.87857,134.5469,273.9137,5.9592,6.0068 +00:17.7,1040.971,1022.315,5.87451,6.86331,134.8984,274.2652,5.9132,5.915 +00:18.8,1040.971,1022.297,5.87451,6.86026,134.8984,274.2652,5.9132,5.915 +00:19.8,1040.983,1022.297,5.87756,6.86941,134.1953,273.2106,5.9132,6.0068 +00:20.8,1040.971,1022.315,5.87756,6.86026,134.1953,273.2106,5.9132,6.0068 +00:21.8,1040.971,1022.315,5.87146,6.86941,134.1953,272.859,5.8672,5.9609 +00:22.8,1040.971,1022.279,5.87451,6.86026,134.1953,272.859,6.0052,5.9609 +00:23.8,1040.971,1022.297,5.87451,6.86026,134.5469,273.2106,5.9592,5.915 +00:24.9,1040.983,1022.297,5.87756,6.86331,135.25,274.2652,6.0052,6.0068 +00:25.9,1040.983,1022.315,5.87146,6.8572,134.1953,273.9137,6.0052,5.915 +00:26.9,1040.983,1022.297,5.87146,6.86026,135.25,273.9137,6.0511,5.915 +00:27.9,1040.983,1022.297,5.86535,6.85415,135.9531,273.9137,6.0052,5.9609 +00:28.9,1040.922,1022.297,5.87451,6.8572,135.6016,274.2652,5.9592,5.9609 +00:29.9,1040.983,1022.279,5.87451,6.86331,134.8984,274.2652,5.9592,6.0068 +00:31.0,1040.971,1022.297,5.87756,6.845,135.25,274.2652,5.9592,5.9609 +00:32.0,1040.983,1022.297,5.8684,6.85415,135.25,274.6168,5.9132,5.9609 +00:33.0,1040.971,1022.315,5.87146,6.845,135.25,274.2652,5.9132,5.9609 +00:34.0,1040.983,1022.297,5.87146,6.87246,135.25,275.6715,5.9592,6.0068 +00:35.0,1040.971,1022.297,5.8684,6.85415,134.8984,274.9684,6.0511,6.0986 +00:36.1,1040.971,1022.315,5.87146,6.83889,133.4922,273.2106,6.0511,6.0068 +00:37.1,1040.983,1022.315,5.87146,6.8511,134.5469,273.9137,6.0511,5.9609 +00:38.1,1040.971,1022.315,5.8684,6.845,134.5469,273.9137,6.1431,6.0986 +00:39.1,1040.983,1022.297,5.87146,6.84805,135.25,273.9137,6.0511,6.0986 +00:40.1,1040.971,1022.297,5.86535,6.8572,135.25,274.6168,6.0971,6.0068 +00:41.2,1040.971,1022.297,5.86535,6.845,135.9531,275.6715,6.1431,6.0527 +00:42.2,1040.983,1022.297,5.86535,6.86636,135.25,274.2652,6.2351,6.1446 +00:43.2,1040.971,1022.297,5.86535,6.84805,133.8438,273.9137,6.0971,5.9609 +00:44.2,1040.971,1022.297,5.87146,6.8572,135.25,273.2106,6.0971,6.0986 +00:45.2,1040.971,1022.297,5.8623,6.83889,133.8438,273.9137,6.0971,6.1446 +00:46.2,1040.983,1022.297,5.8684,6.86941,134.1953,273.5621,6.0971,6.0527 +00:47.2,1040.971,1022.315,5.8684,6.86636,134.5469,273.9137,6.1431,6.0986 +00:48.2,1040.983,1022.297,5.86535,6.86636,135.6016,273.9137,6.0511,6.0527 +00:49.2,1040.922,1022.297,5.86535,6.86026,135.25,272.1559,5.9592,6.0527 
+00:50.3,1040.971,1022.315,5.86535,6.84194,135.9531,274.2652,5.9132,6.0986 +00:51.3,1040.971,1022.315,5.86535,6.85415,136.3047,275.6715,5.7752,6.0527 +00:52.3,1040.971,1022.297,5.86535,6.85415,136.6562,274.9684,5.7752,5.869 +00:53.3,1040.983,1022.297,5.85925,6.85415,136.3047,277.4293,5.7752,5.7772 +00:54.3,1040.983,1022.315,5.85925,6.84805,136.3047,276.7262,5.7292,5.8231 +00:55.4,1040.91,1022.297,5.8684,6.8511,137.7109,274.6168,5.9132,5.869 +00:56.4,1040.971,1022.297,5.8623,6.83889,137.3594,279.1871,5.8672,5.915 +00:57.4,1040.983,1022.315,5.85925,6.8572,135.9531,277.4293,5.9132,5.869 +00:58.4,1040.983,1022.315,5.8623,6.85415,136.6562,275.6715,5.9132,5.8231 +00:59.4,1040.983,1022.297,5.8623,6.8572,137.7109,275.3199,5.9132,6.0527 diff --git a/octue/templates/template-using-manifests/data/input/raw_met_mast_data/08DEC/High Res Meteorological Mast Data - 8 Dec_2.csv b/octue/templates/template-using-manifests/data/input/raw_met_mast_data/08DEC/High Res Meteorological Mast Data - 8 Dec_2.csv new file mode 100644 index 000000000..f412fcfa8 --- /dev/null +++ b/octue/templates/template-using-manifests/data/input/raw_met_mast_data/08DEC/High Res Meteorological Mast Data - 8 Dec_2.csv @@ -0,0 +1,60 @@ +TimeStamp,Barometer_2,Barmoeter_1,temp_2,temp_1,WindVane_2,WindVane_1,Anemo_2,Anemo_1 +01:00.4,1040.922,1022.297,5.85925,6.83279,137.7109,277.0777,5.8212,6.0068 +01:01.5,1040.983,1022.315,5.85925,6.84805,138.7656,278.8356,5.8212,5.915 +01:02.5,1040.971,1022.297,5.8623,6.8511,140.1719,279.1871,5.7752,5.869 +01:03.5,1040.983,1022.297,5.86535,6.8572,139.4688,279.8902,5.8672,5.915 +01:04.5,1040.983,1022.297,5.8623,6.82363,140.875,279.1871,5.8672,5.8231 +01:05.5,1040.971,1022.297,5.8623,6.8572,140.1719,279.5387,5.8212,5.9609 +01:06.6,1040.983,1022.297,5.8562,6.83584,140.875,280.2418,5.9132,5.869 +01:07.6,1040.983,1022.297,5.85925,6.83279,140.1719,280.2418,5.7752,5.869 +01:08.6,1040.971,1022.315,5.8623,6.845,140.5234,279.1871,5.7292,5.8231 +01:09.6,1040.983,1022.315,5.86535,6.82669,141.2266,280.2418,5.6832,5.7313 +01:10.6,1040.983,1022.297,5.85314,6.84194,138.7656,278.8356,5.6372,5.6854 +01:11.7,1040.971,1022.315,5.85314,6.85415,139.4688,277.7809,5.6372,5.6394 +01:12.7,1040.971,1022.297,5.8623,6.83584,139.4688,279.1871,5.5912,5.5935 +01:13.7,1040.971,1022.297,5.8562,6.82974,139.8203,280.2418,5.5912,5.6854 +01:14.7,1040.983,1022.315,5.8562,6.84194,141.2266,278.484,5.6372,5.5935 +01:15.7,1040.971,1022.297,5.85314,6.84194,139.8203,279.1871,5.5912,5.6854 +01:16.7,1040.983,1022.297,5.85314,6.83584,139.1172,279.1871,5.6372,5.6854 +01:17.8,1040.971,1022.297,5.85314,6.80837,138.0625,278.484,5.6372,5.6854 +01:18.8,1040.983,1022.297,5.85314,6.83584,139.4688,277.0777,5.7292,5.6854 +01:19.8,1040.983,1022.297,5.8562,6.84805,138.7656,278.8356,5.7292,5.6394 +01:20.8,1040.983,1022.297,5.8562,6.83584,139.1172,278.1324,5.7292,5.6394 +01:21.8,1040.983,1022.297,5.8562,6.81448,140.1719,279.5387,5.6372,5.5935 +01:22.8,1040.983,1022.315,5.85314,6.83584,141.2266,278.8356,5.6832,5.7772 +01:23.9,1040.971,1022.297,5.85314,6.82363,141.5781,280.2418,5.6372,5.8231 +01:24.9,1040.983,1022.297,5.85314,6.82058,140.5234,280.9449,5.6832,5.7313 +01:25.9,1040.983,1022.297,5.84704,6.82058,139.8203,278.484,5.7292,5.7772 +01:26.9,1040.983,1022.279,5.85009,6.82058,140.5234,279.8902,5.7292,5.7772 +01:27.9,1040.983,1022.315,5.84704,6.81448,139.1172,278.484,5.6832,5.7313 +01:28.9,1040.983,1022.315,5.84704,6.82363,139.4688,279.8902,5.7752,5.6394 +01:29.9,1040.983,1022.297,5.84094,6.83584,138.0625,277.4293,5.8672,5.8231 
+01:31.0,1040.983,1022.297,5.85009,6.81753,137.0078,277.7809,5.7292,5.7313
+01:32.0,1040.971,1022.297,5.85314,6.82974,137.7109,276.0231,5.8672,5.8231
+01:33.0,1040.971,1022.297,5.85314,6.82058,137.7109,276.7262,5.7752,5.869
+01:34.0,1040.971,1022.297,5.85314,6.81753,137.3594,276.0231,5.7752,5.869
+01:35.0,1040.971,1022.279,5.84704,6.81753,138.4141,278.484,5.7752,5.8231
+01:36.0,1040.91,1022.279,5.85009,6.82669,138.7656,281.6481,5.6832,5.7313
+01:37.1,1040.983,1022.279,5.85009,6.79617,140.5234,277.4293,5.7752,5.6854
+01:38.1,1040.983,1022.279,5.84704,6.80532,140.1719,277.7809,5.6832,5.7772
+01:39.1,1040.971,1022.297,5.84399,6.79922,136.6562,280.2418,5.6372,5.7772
+01:40.1,1040.983,1022.297,5.84704,6.82058,137.0078,275.6715,5.6832,5.7313
+01:41.1,1040.971,1022.315,5.84704,6.82363,136.6562,276.0231,5.5912,5.8231
+01:42.1,1040.983,1022.279,5.84399,6.79006,137.0078,276.0231,5.5453,5.6854
+01:43.1,1040.983,1022.297,5.84704,6.79617,136.3047,275.6715,5.5912,5.6394
+01:44.2,1040.983,1022.315,5.84704,6.79312,136.6562,276.0231,5.5912,5.6394
+01:45.2,1040.983,1022.279,5.84094,6.80837,137.0078,276.3746,5.5912,5.5935
+01:46.2,1040.983,1022.297,5.83789,6.80837,136.3047,276.0231,5.5912,5.5017
+01:47.2,1040.971,1022.297,5.84094,6.81448,136.3047,275.6715,5.5912,5.6394
+01:48.3,1040.983,1022.297,5.84704,6.81143,137.0078,275.6715,5.6832,5.6854
+01:49.3,1040.983,1022.297,5.84399,6.82974,137.0078,276.3746,5.7752,5.6854
+01:50.3,1040.983,1022.297,5.84399,6.79617,139.4688,276.7262,5.8672,5.8231
+01:51.3,1040.971,1022.297,5.84399,6.80227,139.8203,277.7809,5.8212,5.869
+01:52.3,1041.032,1022.297,5.84399,6.79312,138.4141,278.1324,5.8212,5.915
+01:53.3,1041.02,1022.297,5.84704,6.81143,140.5234,277.0777,5.9592,5.8231
+01:54.3,1040.983,1022.297,5.83789,6.79922,141.5781,277.7809,5.9132,5.915
+01:55.4,1040.971,1022.279,5.84399,6.78701,138.7656,276.3746,5.9592,5.915
+01:56.4,1040.971,1022.279,5.84399,6.76565,139.1172,277.7809,5.9132,6.0527
+01:57.4,1040.983,1022.297,5.84399,6.79617,135.25,277.4293,5.9132,5.915
+01:58.4,1040.983,1022.297,5.83789,6.79006,138.4141,276.3746,6.0052,6.0527
+01:59.4,1040.971,1022.279,5.84399,6.79006,136.3047,275.6715,6.0971,6.0068
diff --git a/octue/templates/template-using-manifests/data/input/raw_met_mast_data/08DEC/meta - 8 Dec.dat b/octue/templates/template-using-manifests/data/input/raw_met_mast_data/08DEC/meta - 8 Dec.dat
new file mode 100644
index 000000000..d0b9f98a2
--- /dev/null
+++ b/octue/templates/template-using-manifests/data/input/raw_met_mast_data/08DEC/meta - 8 Dec.dat
@@ -0,0 +1,17 @@
+DATE: 8 December 2019
+SYS_TEMP: 17.06
+COMMENTS
+########
+Yes, this is a *really* bad way for anyone to decide to store metadata... And CSVs are a horribly inefficient way of
+storing numeric data (as well as being prone to corruption e.g. if data has missing values, or strings have commas, etc).
+
+If you're creating datasets, don't do this. It's a completely custom file format, totally crazy and not easily
+machine-readable.
+
+Do yourself, and everybody else, a favour and write your metadata as JSON, and if you have heavyweight numeric data
+consider HDF5 (or netCDF, or similar) formats, which will be more compact and robust than CSVs.
+
+Why are we giving you this example, then?
+
+Because we've seen this, and much worse, many times in real life. We think it's better for examples to be based on
+actual real data, rather than the perfect ideal scenario.
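(As an aside, the JSON alternative recommended in the comments above takes only a couple of lines of Python. A minimal
sketch - the file name and field names here are purely illustrative, not part of the template:

    import json

    # Write the metadata as JSON instead of a custom .dat format
    metadata = {"date": "2019-12-08", "sys_temp": 17.06}
    with open("meta - 8 Dec.json", "w") as fp:
        json.dump(metadata, fp, indent=2)

Reading it back is then a single json.load() call, with no custom parsing code to maintain.)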
diff --git a/octue/templates/template-using-manifests/requirements.txt b/octue/templates/template-using-manifests/requirements.txt
new file mode 100644
index 000000000..448b6cfe1
--- /dev/null
+++ b/octue/templates/template-using-manifests/requirements.txt
@@ -0,0 +1,30 @@
+octue==0.1.4
+
+
+# ----------- Some common libraries ----------------------------------------------------------------------------------
+
+# You may wish to include these. Removing these may break the examples, but won't break the template :)
+
+
+# Plotting tools to help create JSON-based figure files (more powerful, with better validation, than creating the raw JSON yourself)
+#plotly==3.6.1
+
+# A numerical manipulation library
+numpy==1.19.2
+
+# A library for operating on tabulated data
+pandas==1.1.4
+
+# An incredibly powerful date parsing utility
+dateparser==1.0.0
+
+# A utility library for converting text cases; useful for cleaning up column names and such
+stringcase==1.2.0
+
+
+# A powerful database API library. Supply it with your db's URI (through environment variables - don't commit URIs
+# to git!!!!) and read/add data to/from databases.
+# Note: Results from apps that use externally managed data sources cannot be guaranteed to be idempotent (reproducible),
+# because the data can change between runs.
+#SQLAlchemy==1.0.12
+#SQLAlchemy-Utils==0.31.6
diff --git a/octue/templates/template-using-manifests/setup.py b/octue/templates/template-using-manifests/setup.py
new file mode 100644
index 000000000..708dfed04
--- /dev/null
+++ b/octue/templates/template-using-manifests/setup.py
@@ -0,0 +1,14 @@
+import subprocess
+from setuptools import setup
+
+
+def git_version():
+    # os.system() would return the command's exit status (an int), not its output, so use subprocess to get the sha
+    return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
+
+
+# This file makes your module installable as a library. It's not essential for running apps with twined.
+
+setup(
+    name="template-using-manifests", version=git_version(), py_modules=["app"],
+)
diff --git a/octue/templates/template-using-manifests/twine.json b/octue/templates/template-using-manifests/twine.json
new file mode 100644
index 000000000..7bf9e2324
--- /dev/null
+++ b/octue/templates/template-using-manifests/twine.json
@@ -0,0 +1,29 @@
+{
+    "configuration_values_schema": {
+        "title": "Configuration for the manifest example app",
+        "description": "The app cleans up raw CSV files, correcting column names and adding the right timestamps.",
+        "type": "object",
+        "properties": {
+            "window_size": {
+                "description": "The time window in seconds over which results will be averaged",
+                "type": "integer",
+                "minimum": 1,
+                "default": 600
+            }
+        }
+    },
+    "input_manifest": [
+        {
+            "key": "raw_met_mast_data",
+            "purpose": "A dataset containing .csv files of raw meteorological mast data which we need to clean up",
+            "filters": "tags:(met AND mast) files:(extension:csv)"
+        }
+    ],
+    "output_manifest": [
+        {
+            "key": "cleaned_met_mast_data",
+            "purpose": "A dataset containing .csv files of cleaned meteorological mast data",
+            "filters": "tags:(met AND mast AND cleaned) files:(extension:csv)"
+        }
+    ]
+}
diff --git a/octue/templates/templates.py b/octue/templates/templates.py
index 74350c89b..b57433c1f 100644
--- a/octue/templates/templates.py
+++ b/octue/templates/templates.py
@@ -8,7 +8,7 @@
 # TODO add ONLINE_TEMPLATES and combine to AVAILABLE_TEMPLATES, to enable us to seamlessly deliver templates that are
 # more complex (or have lots of data that we don't want packaged with the module).
-PACKAGED_TEMPLATES = tuple(filter(lambda name: name.startswith("template-"), resource_listdir("octue", "templates")))
+PACKAGED_TEMPLATES = tuple(name for name in resource_listdir("octue", "templates") if name.startswith("template-"))
 
 
 def copy_template(template_name, destination_dir="."):
diff --git a/octue/utils/__init__.py b/octue/utils/__init__.py
index 2c4e00e2b..2923eb13d 100644
--- a/octue/utils/__init__.py
+++ b/octue/utils/__init__.py
@@ -1,3 +1,6 @@
-from .gen_uuid import gen_uuid  # noqa: F401
-from .isfile import isfile  # noqa: F401
-from .isfolder import isfolder  # noqa: F401
+from .gen_uuid import gen_uuid
+from .isfile import isfile
+from .isfolder import isfolder
+
+
+__all__ = "gen_uuid", "isfile", "isfolder"
diff --git a/octue/utils/folders.py b/octue/utils/folders.py
index a8b297128..e2bc08970 100644
--- a/octue/utils/folders.py
+++ b/octue/utils/folders.py
@@ -1,40 +1,25 @@
 import os
 
-from octue import exceptions
-from .isfolder import isfolder
+from octue.definitions import OUTPUT_STRANDS, STRAND_FILENAME_MAP
 
 
-FOLDERS = (
-    "configuration",
-    "input",
-    "log",
-    "tmp",
-    "output",
-)
+def get_file_name_from_strand(strand, path):
+    """ Where values or a manifest are contained in a local file, assemble that file name.
 
+    For output strands, the directory is created if it doesn't already exist; the presence of input directories is
+    validated elsewhere.
 
-def from_path(path_hints, folders=FOLDERS):
-    """ NOT IMPLEMENTED YET
-    Helper to find paths to individual configurations from hints
-    TODO Fix this
+    :param strand: The name of the strand
+    :type strand: str
+
+    :param path: The directory where the file is / will be saved
+    :type path: path-like
+
+    :return: A file name for the strand
+    :rtype: path-like
     """
-    # Set paths
-    paths = dict()
-    if isinstance(path_hints, str):
-        if not os.path.isdir(path_hints):
-            raise exceptions.FolderNotFoundException(f"Specified data folder '{path_hints}' not present")
-
-        paths = dict([(folder, os.path.join(path_hints, folder)) for folder in folders])
-
-    else:
-        if (
-            not isinstance(paths, dict)
-            or (len(paths.keys()) != len(folders))
-            or not all([k in folders for k in paths.keys()])
-        ):
-            raise exceptions.InvalidInputException(
-                f"Input 'paths' should be a dict containing directory paths with the following keys: {folders}"
-            )
-
-    # Ensure paths exist on disc??
-    for folder in FOLDERS:
-        isfolder(paths[folder], make_if_absent=True)
+
+    if strand in OUTPUT_STRANDS:
+        os.makedirs(path, exist_ok=True)
+
+    return os.path.join(path, STRAND_FILENAME_MAP[strand])
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 4099d9a0c..c07286d2f 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -24,10 +24,14 @@
 twine   # <---- nothing to do with the twined library!
-r docs/requirements.txt -# Templates +# Template App Dependencies +# (Used in template apps but not necessarily dependencies of the library) # ------------------------------------------------------------------------------ -numpy # Used in template apps but not necessarily a dependency of the library +numpy==1.19.2 +dateparser==1.0.0 +pandas==1.1.4 +stringcase==1.2.0 # Current library # Installs any dependencies in setup.py diff --git a/setup.py b/setup.py index c7952251b..193a74cd9 100644 --- a/setup.py +++ b/setup.py @@ -17,9 +17,9 @@ setup( name="octue", - version="0.1.3", + version="0.1.4", py_modules=["cli"], - install_requires=["click>=7.1.2", "twined==0.0.12"], # Dev note: you also need to bump twined in tox.ini + install_requires=["click>=7.1.2", "twined==0.0.13"], # Dev note: you also need to bump twined in tox.ini url="https://www.github.com/octue/octue-sdk-python", license="MIT", author="Thomas Clark (github: thclark)", @@ -29,6 +29,10 @@ zip_safe=False, # Allows copying of templates as whole directory trees packages=find_packages(exclude=("tests", "docs")), include_package_data=True, + entry_points=""" + [console_scripts] + octue-app=octue.cli:octue_cli + """, classifiers=[ "Development Status :: 4 - Beta", "Intended Audience :: Developers", diff --git a/tests/__init__.py b/tests/__init__.py index e69de29bb..7e690b154 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,4 @@ +import os + + +TESTS_DIR = os.path.dirname(__file__) diff --git a/tests/app.py b/tests/app.py new file mode 100644 index 000000000..ebd6f76ff --- /dev/null +++ b/tests/app.py @@ -0,0 +1,5 @@ +CUSTOM_APP_RUN_MESSAGE = "This is a custom app run function" + + +def run(analysis): + print(CUSTOM_APP_RUN_MESSAGE) # noqa:T001 diff --git a/tests/base.py b/tests/base.py index 6beca6769..995b8550d 100644 --- a/tests/base.py +++ b/tests/base.py @@ -14,8 +14,8 @@ def setUp(self): # Set up paths to the test data directory and to the app templates directory root_dir = os.path.dirname(os.path.abspath(__file__)) - self.data_path = str(os.path.join(root_dir, "data", "")) - self.templates_path = str(os.path.join(os.path.dirname(root_dir), "octue", "templates", "")) + self.data_path = os.path.join(root_dir, "data") + self.templates_path = os.path.join(os.path.dirname(root_dir), "octue", "templates") super().setUp() def callCli(self, args): diff --git a/tests/data/data_dir_with_no_manifests/configuration/values.json b/tests/data/data_dir_with_no_manifests/configuration/values.json new file mode 100644 index 000000000..e3c9f6557 --- /dev/null +++ b/tests/data/data_dir_with_no_manifests/configuration/values.json @@ -0,0 +1,3 @@ +{ + "n_iterations": 5 +} diff --git a/tests/data/data_dir_with_no_manifests/input/values.json b/tests/data/data_dir_with_no_manifests/input/values.json new file mode 100644 index 000000000..58275dbd0 --- /dev/null +++ b/tests/data/data_dir_with_no_manifests/input/values.json @@ -0,0 +1,3 @@ +{ + "height": 3 +} diff --git a/octue/templates/template-python-fractal/README.py b/tests/mixins/__init__.py similarity index 100% rename from octue/templates/template-python-fractal/README.py rename to tests/mixins/__init__.py diff --git a/tests/mixins/test_base.py b/tests/mixins/test_base.py new file mode 100644 index 000000000..4b7bcfa65 --- /dev/null +++ b/tests/mixins/test_base.py @@ -0,0 +1,18 @@ +from octue.mixins import MixinBase +from ..base import BaseTestCase + + +class MixinBaseTestCase(BaseTestCase): + def test_instantiates_with_no_args(self): + """ Ensures the class instantiates without 
arguments + """ + MixinBase() + + def test_raises_exception_if_passed_args(self): + """ Ensures that the base mixin won't silently fail if passed arguments it's not supposed to have + """ + with self.assertRaises(TypeError): + MixinBase("an_argument") + + with self.assertRaises(TypeError): + MixinBase(an_argument="an_argument") diff --git a/tests/test_identifiable.py b/tests/mixins/test_identifiable.py similarity index 93% rename from tests/test_identifiable.py rename to tests/mixins/test_identifiable.py index f7d1454f7..a5ea4e510 100644 --- a/tests/test_identifiable.py +++ b/tests/mixins/test_identifiable.py @@ -2,7 +2,7 @@ from octue import exceptions from octue.mixins import Identifiable -from .base import BaseTestCase +from ..base import BaseTestCase class IdentifiableTestCase(BaseTestCase): @@ -74,7 +74,7 @@ class Inherit(Identifiable): pass resource = Inherit() - with self.assertRaises(exceptions.InvalidInputException) as e: + with self.assertRaises(AttributeError) as e: resource.id = "07d38e81-6b00-4079-901b-e250ea3c7773" - self.assertIn("You cannot set the id of an already-instantiated Inherit", e.exception.args[0]) + self.assertIn("can't set attribute", e.exception.args[0]) diff --git a/tests/test_loggable.py b/tests/mixins/test_loggable.py similarity index 94% rename from tests/test_loggable.py rename to tests/mixins/test_loggable.py index 4ca7b4edd..6732c4760 100644 --- a/tests/test_loggable.py +++ b/tests/mixins/test_loggable.py @@ -1,10 +1,7 @@ import logging from octue.mixins import Loggable -from .base import BaseTestCase - - -module_logger = logging.getLogger(__name__) +from ..base import BaseTestCase class InheritLoggable(Loggable): diff --git a/tests/mixins/test_pathable.py b/tests/mixins/test_pathable.py new file mode 100644 index 000000000..f7c31f364 --- /dev/null +++ b/tests/mixins/test_pathable.py @@ -0,0 +1,97 @@ +import os + +from octue.exceptions import InvalidInputException +from octue.mixins import MixinBase, Pathable +from ..base import BaseTestCase + + +class MyPathable(Pathable, MixinBase): + pass + + +class PathableTestCase(BaseTestCase): + def test_instantiates_with_no_args(self): + """ Ensures the class instantiates without arguments, with default paths at the current working directory + """ + resource = MyPathable() + self.assertEqual(os.getcwd(), resource.absolute_path) + self.assertEqual(".", resource.relative_path) + self.assertEqual(".", resource.path) + + def test_paths_chain(self): + """ Ensures that pathable resources daisychain their paths + """ + owner = MyPathable() + owned = MyPathable(path_from=owner, path="owned") + owned_owned = MyPathable(path_from=owned, path="owned_owned") + self.assertEqual(os.path.join(os.getcwd(), "owned", "owned_owned"), owned_owned.absolute_path) + + def test_paths_chain_dynamic(self): + """ Ensures that daisychaining updates if path in the chain changes + + This enables us to initialise (for example) datasets from manifests, where all the paths are given relative to + the dataset, then alter the path further up the tree to where a directory actually is on the system. 
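+
+        A minimal sketch of the pattern (mirroring the assertions below; note `_path` is the internal attribute,
+        poked directly here just to simulate the change):
+
+            owner = MyPathable()
+            owned = MyPathable(path_from=owner, path="owned")
+            owner._path = "dynamic"  # owned.absolute_path now ends with dynamic/owned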
+        """
+        owner = MyPathable()
+        owned = MyPathable(path_from=owner, path="owned")
+        self.assertEqual(os.path.join(os.getcwd(), "owned"), owned.absolute_path)
+        owner._path = "dynamic"
+        self.assertEqual(os.path.join(os.getcwd(), "dynamic", "owned"), owned.absolute_path)
+
+    def test_paths_chain_with_missing_values(self):
+        """ Ensures that pathable resources chain even if a part of the chain doesn't have a path
+        """
+
+        # Owner is in the current working directory
+        owner = MyPathable(path="owner")
+        owned = MyPathable(path_from=owner)  # This resource doesn't have a path property
+        owned_owned = MyPathable(path_from=owned, path="owned_owned")
+        self.assertEqual(os.path.join(os.getcwd(), "owner", "owned_owned"), owned_owned.absolute_path)
+
+    def test_paths_relative(self):
+        """ Ensures that pathable resources have a relative path (by default relative to current working directory)
+        """
+        owner = MyPathable(path="owner")
+        owned = MyPathable(path_from=owner, path="owned")
+        self.assertEqual(os.path.join("owner", "owned"), owned.relative_path)
+
+    def test_paths_relative_to_base(self):
+        """ Ensures that pathable resources have a relative path that is relative to base_path if given
+        """
+        # Check it works for a single depth
+        owner1 = MyPathable(path="owner")
+        owned1 = MyPathable(path_from=owner1, path="owned", base_from=owner1)
+        self.assertEqual(os.path.join("owned"), owned1.relative_path)
+
+        # Check it works at several depths
+        owner2 = MyPathable(path="owner")
+        owned2 = MyPathable(path_from=owner2, path="owned")
+        owned_owned2 = MyPathable(path_from=owned2, path="owned_owned", base_from=owner2)
+        self.assertEqual(os.path.join("owned", "owned_owned"), owned_owned2.relative_path)
+
+    def test_invalid_base_and_path_from(self):
+        """ Ensures that exceptions are correctly raised when the *_from objects are not Pathables
+        """
+
+        class NotPathable:
+            pass
+
+        with self.assertRaises(InvalidInputException):
+            MyPathable(path="owner", path_from=NotPathable())
+
+        with self.assertRaises(InvalidInputException):
+            MyPathable(path="owner", base_from=NotPathable())
+
+    def test_valid_absolute_path_without_from_path(self):
+        """ Ensures that an absolute path can be set if no path_from object is present
+        """
+        owner1 = MyPathable(path="/owner")
+        self.assertEqual("/owner", owner1.absolute_path)
+
+    def test_invalid_absolute_path_with_from_path(self):
+        """ Ensures that an absolute path cannot be used on a resource whose path chains from a path_from object
+        """
+        # The absolute path conflicts with the owner's relative path, so instantiation should raise
+        owner1 = MyPathable(path="owner")
+        with self.assertRaises(InvalidInputException):
+            MyPathable(path_from=owner1, path="/owned")
diff --git a/tests/test_serialisable.py b/tests/mixins/test_serialisable.py
similarity index 61%
rename from tests/test_serialisable.py
rename to tests/mixins/test_serialisable.py
index 18c29ac2b..73eadf388 100644
--- a/tests/test_serialisable.py
+++ b/tests/mixins/test_serialisable.py
@@ -4,7 +4,20 @@
 from tempfile import TemporaryDirectory
 
 from octue.mixins import Serialisable
-from .base import BaseTestCase
+from ..base import BaseTestCase
+
+
+class Inherit(Serialisable):
+    def __init__(self):
+        super().__init__()
+        self.id = "id"
+        self.logger = logging.getLogger("test_returns_primitive_without_logger_or_protected_fields")
+        self.field_to_serialise = 0
+        self._field_not_to_serialise = 1
+
+
+class InheritWithFieldsToSerialise(Inherit):
+    _serialise_fields = ("field_to_serialise",)
 
 
 class SerialisableTestCase(BaseTestCase):
@@ -25,15 +38,6 @@ def
test_raises_attribute_error_with_missing_logger(self): def test_returns_primitive_without_logger_or_protected_fields(self): """ Ensures class instantiates with a UUID() """ - - class Inherit(Serialisable): - def __init__(self): - super().__init__() - self.id = "id" - self.logger = logging.getLogger("test_returns_primitive_without_logger_or_protected_fields") - self.field_to_serialise = 0 - self._field_not_to_serialise = 1 - resource = Inherit() serialised = resource.serialise() self.assertTrue("id" in serialised.keys()) @@ -44,18 +48,7 @@ def __init__(self): def test_serialise_only_attrs(self): """ Restricts the id field, which would normally be serialised """ - - class Inherit(Serialisable): - _serialise_fields = ("field_to_serialise",) - - def __init__(self): - super().__init__() - self.id = "id" - self.logger = logging.getLogger("test_returns_primitive_without_logger_or_protected_fields") - self.field_to_serialise = 0 - self._field_not_to_serialise = 1 - - resource = Inherit() + resource = InheritWithFieldsToSerialise() serialised = resource.serialise() self.assertFalse("id" in serialised.keys()) self.assertTrue("field_to_serialise" in serialised.keys()) @@ -65,18 +58,7 @@ def __init__(self): def test_serialise_to_string(self): """ Restricts the id field, which would normally be serialised """ - - class Inherit(Serialisable): - _serialise_fields = ("field_to_serialise",) - - def __init__(self): - super().__init__() - self.id = "id" - self.logger = logging.getLogger("test_returns_primitive_without_logger_or_protected_fields") - self.field_to_serialise = 0 - self._field_not_to_serialise = 1 - - resource = Inherit() + resource = InheritWithFieldsToSerialise() serialised = resource.serialise(to_string=True) self.assertIsInstance(serialised, str) @@ -84,14 +66,6 @@ def test_serialise_to_file(self): """ Restricts the id field, which would normally be serialised """ - class Inherit(Serialisable): - def __init__(self): - super().__init__() - self.id = "id" - self.logger = logging.getLogger("test_returns_primitive_without_logger_or_protected_fields") - self.field_to_serialise = 0 - self._field_not_to_serialise = 1 - with TemporaryDirectory() as dir_name: file_name = os.path.join(dir_name, "test_serialise_to_file.json") resource = Inherit() diff --git a/tests/test_taggable.py b/tests/mixins/test_taggable.py similarity index 87% rename from tests/test_taggable.py rename to tests/mixins/test_taggable.py index 362e5a3cc..15378df69 100644 --- a/tests/test_taggable.py +++ b/tests/mixins/test_taggable.py @@ -1,7 +1,11 @@ from octue import exceptions -from octue.mixins import Taggable +from octue.mixins import MixinBase, Taggable from octue.mixins.taggable import TagGroup -from .base import BaseTestCase +from ..base import BaseTestCase + + +class MyTaggable(Taggable, MixinBase): + pass class TaggableTestCase(BaseTestCase): @@ -13,21 +17,21 @@ def test_instantiates(self): def test_instantiates_with_tags(self): """ Ensures datafile inherits correctly from the Taggable class and passes arguments through """ - tgd = Taggable(tags="") + tgd = MyTaggable(tags="") self.assertEqual("", str(tgd.tags)) - tgd = Taggable(tags=None) + tgd = MyTaggable(tags=None) self.assertEqual("", str(tgd.tags)) - tgd = Taggable(tags="a b c") + tgd = MyTaggable(tags="a b c") self.assertEqual("a b c", str(tgd.tags)) with self.assertRaises(exceptions.InvalidTagException): - Taggable(tags=":a b c") + MyTaggable(tags=":a b c") def test_instantiates_with_tag_group(self): """ Ensures datafile inherits correctly from the Taggable class 
and passes arguments through """ - tgd = Taggable(tags="") + tgd = MyTaggable(tags="") self.assertIsInstance(tgd.tags, TagGroup) - tgd2 = Taggable(tags=tgd.tags) + tgd2 = MyTaggable(tags=tgd.tags) self.assertFalse(tgd is tgd2) def test_fails_to_instantiates_with_non_iterable(self): @@ -38,7 +42,7 @@ class NoIter: pass with self.assertRaises(exceptions.InvalidTagException) as error: - Taggable(tags=NoIter()) + MyTaggable(tags=NoIter()) self.assertIn( "Tags must be expressed as a whitespace-delimited string or an iterable of strings", error.exception.args[0] @@ -47,14 +51,14 @@ class NoIter: def test_reset_tags(self): """ Ensures datafile inherits correctly from the Taggable class and passes arguments through """ - tgd = Taggable(tags="a b") + tgd = MyTaggable(tags="a b") tgd.tags = "b c" self.assertEqual(str(tgd.tags), "b c") def test_valid_tags(self): """ Ensures valid tags do not raise an error """ - tgd = Taggable() + tgd = MyTaggable() tgd.add_tags("a-valid-tag") tgd.add_tags("a:tag") tgd.add_tags("a:-tag") # <--- yes, this is valid deliberately as it allows people to do negation @@ -70,7 +74,7 @@ def test_invalid_tags(self): """ Ensures invalid tags raise an error """ - tgd = Taggable() + tgd = MyTaggable() with self.assertRaises(exceptions.InvalidTagException): tgd.add_tags("-bah") @@ -89,7 +93,7 @@ def test_invalid_tags(self): def test_mixture_valid_invalid(self): """ Ensures that adding a variety of tags, some of which are invalid, doesn't partially add them to the object """ - tgd = Taggable() + tgd = MyTaggable() tgd.add_tags("first-valid-should-be-added") try: tgd.add_tags("second-valid-should-not-be-added-because", "-the-third-is-invalid:") diff --git a/tests/resources/__init__.py b/tests/resources/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_analysis.py b/tests/resources/test_analysis.py similarity index 97% rename from tests/test_analysis.py rename to tests/resources/test_analysis.py index de9454e9d..f2b8aaccc 100644 --- a/tests/test_analysis.py +++ b/tests/resources/test_analysis.py @@ -3,7 +3,7 @@ from octue import exceptions from octue.resources import Analysis from twined import Twine -from .base import BaseTestCase +from ..base import BaseTestCase class AnalysisTestCase(BaseTestCase): diff --git a/tests/resources/test_datafile.py b/tests/resources/test_datafile.py new file mode 100644 index 000000000..0979a983c --- /dev/null +++ b/tests/resources/test_datafile.py @@ -0,0 +1,97 @@ +import os +import uuid + +from octue import exceptions +from octue.mixins import MixinBase, Pathable +from octue.resources import Datafile +from ..base import BaseTestCase + + +class MyPathable(Pathable, MixinBase): + pass + + +class DatafileTestCase(BaseTestCase): + def setUp(self): + super().setUp() + self.path_from = MyPathable(path=os.path.join(self.data_path, "basic_files", "configuration", "test-dataset")) + self.path = os.path.join("path-within-dataset", "a_test_file.csv") + + def create_valid_datafile(self): + return Datafile(path_from=self.path_from, base_from=self.path_from, path=self.path, skip_checks=False) + + def test_instantiates(self): + """ Ensures a Datafile instantiates using only a path and generates a uuid ID + """ + df = Datafile(path="a_path") + self.assertTrue(isinstance(df.id, str)) + self.assertEqual(type(uuid.UUID(df.id)), uuid.UUID) + self.assertIsNone(df.sequence) + self.assertEqual(0, df.cluster) + + def test_path_argument_required(self): + """ Ensures instantiation without a path will fail + """ + with 
self.assertRaises(exceptions.InvalidInputException) as error:
+            Datafile()
+
+        self.assertIn("You must supply a valid 'path' for a Datafile", error.exception.args[0])
+
+    def test_checks_fail_when_file_doesnt_exist(self):
+        path = "not_a_real_file.csv"
+        with self.assertRaises(exceptions.FileNotFoundException) as error:
+            Datafile(path=path, skip_checks=False)
+        self.assertIn("No file found at", error.exception.args[0])
+
+    def test_conflicting_extension_fails_check(self):
+        with self.assertRaises(exceptions.InvalidInputException) as error:
+            Datafile(path_from=self.path_from, path=self.path, skip_checks=False, extension="notcsv")
+
+        self.assertIn("Extension provided (notcsv) does not match file extension", error.exception.args[0])
+
+    def test_file_attributes_accessible(self):
+        """ Ensures that it's possible to set the sequence, cluster and timestamp
+        """
+        df = self.create_valid_datafile()
+        self.assertIsInstance(df.size_bytes, int)
+        self.assertGreaterEqual(df.last_modified, 1598200190.5771205)
+        self.assertEqual("a_test_file.csv", df.name)
+
+        df.sequence = 2
+        df.cluster = 0
+        df.posix_timestamp = 0
+
+    def test_cannot_set_calculated_file_attributes(self):
+        """ Ensures that calculated attributes cannot be set
+        """
+        df = self.create_valid_datafile()
+
+        with self.assertRaises(AttributeError):
+            df.size_bytes = 1
+
+        with self.assertRaises(AttributeError):
+            df.last_modified = 1000000000.5771205
+
+    def test_serialisable(self):
+        """ Ensures a datafile can serialise to JSON format
+        """
+        df = self.create_valid_datafile()
+        df_dict = df.serialise()
+
+        for k in df_dict.keys():
+            self.assertFalse(k.startswith("_"))
+
+        for k in (
+            "cluster",
+            "extension",
+            "id",
+            "last_modified",
+            "name",
+            "path",
+            "posix_timestamp",
+            "sequence",
+            "size_bytes",
+            "tags",
+            "sha_256",
+        ):
+            self.assertIn(k, df_dict.keys())
diff --git a/tests/test_dataset.py b/tests/resources/test_dataset.py
similarity index 86%
rename from tests/test_dataset.py
rename to tests/resources/test_dataset.py
index 9ec7713bc..d563be91f 100644
--- a/tests/test_dataset.py
+++ b/tests/resources/test_dataset.py
@@ -1,6 +1,6 @@
 from octue import exceptions
 from octue.resources import Datafile, Dataset
-from .base import BaseTestCase
+from ..base import BaseTestCase
 
 
 class DatafileTestCase(BaseTestCase):
@@ -62,11 +62,6 @@ class NotADatafile:
             self.assertIn("must be of class Datafile to append it to a Dataset", e.exception.args[0])
 
-    def test_get_files_daisychains(Self):
-        """ Ensures that the output of get_files can be reused to filter down further
-        """
-        pass
-
     def test_get_files_catches_single_underscore_mistake(self):
         """ Ensures that if the field name is a single underscore, that gets caught as an error
         """
@@ -153,17 +148,16 @@ def test_get_files_by_tag(self):
     def test_get_file_by_tag(self):
         """ Ensures that get_files works with tag lookups
         """
-        resource = Dataset(
-            files=[
-                Datafile(path="path-within-dataset/a_my_file.csv", tags="one a:2 b:3 all"),
-                Datafile(path="path-within-dataset/a_your_file.csv", tags="two a:2 b:3 all"),
-                Datafile(path="path-within-dataset/a_your_file.csv", tags="three all"),
-            ]
-        )
+        files = [
+            Datafile(path="path-within-dataset/a_my_file.csv", tags="one a:2 b:3 all"),
+            Datafile(path="path-within-dataset/a_your_file.csv", tags="two a:2 b:3 all"),
+            Datafile(path="path-within-dataset/a_your_file.csv", tags="three all"),
+        ]
+
+        resource = Dataset(files=files)
 
         # Check working for single result
-        df = resource.get_file_by_tag("three")
-        self.assertTrue(isinstance(df, Datafile))
+        
self.assertIs(resource.get_file_by_tag("three"), files[2]) # Check raises for too many results with self.assertRaises(exceptions.UnexpectedNumberOfResultsException) as e: @@ -193,15 +187,14 @@ def test_get_files_by_sequence_notnone(self): def test_get_file_sequence(self): """ Ensures that get_files works with sequence lookups """ - resource = Dataset( - files=[ - Datafile(path="path-within-dataset/a_my_file.csv", sequence=0), - Datafile(path="path-within-dataset/a_your_file.csv", sequence=1), - Datafile(path="path-within-dataset/a_your_file.csv", sequence=None), - ] - ) - files = resource.get_file_sequence("name__endswith", filter_value=".csv", strict=True) - self.assertEqual(2, len(files)) + files = [ + Datafile(path="path-within-dataset/a_my_file.csv", sequence=0), + Datafile(path="path-within-dataset/a_your_file.csv", sequence=1), + Datafile(path="path-within-dataset/a_your_file.csv", sequence=None), + ] + + got_files = Dataset(files=files).get_file_sequence("name__endswith", filter_value=".csv", strict=True) + self.assertEqual(got_files, files[:2]) def test_get_broken_file_sequence(self): """ Ensures that get_files works with sequence lookups @@ -219,14 +212,12 @@ def test_get_broken_file_sequence(self): def test_get_files_name_filters_include_extension(self): """ Ensures that filters applied to the name will catch terms in the extension """ - resource = Dataset( - files=[ - Datafile(path="path-within-dataset/a_test_file.csv"), - Datafile(path="path-within-dataset/a_test_file.txt"), - ] - ) - files = resource.get_files("name__icontains", filter_value="txt") - self.assertEqual(1, len(files)) + files = [ + Datafile(path="path-within-dataset/a_test_file.csv"), + Datafile(path="path-within-dataset/a_test_file.txt"), + ] + + self.assertEqual(Dataset(files=files).get_files("name__icontains", filter_value="txt"), [files[1]]) def test_get_files_name_filters_exclude_path(self): """ Ensures that filters applied to the name will not catch terms in the extension diff --git a/tests/templates/__init__.py b/tests/templates/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_template_apps.py b/tests/templates/test_template_apps.py similarity index 64% rename from tests/test_template_apps.py rename to tests/templates/test_template_apps.py index ce424e907..7b19fe01c 100644 --- a/tests/test_template_apps.py +++ b/tests/templates/test_template_apps.py @@ -4,7 +4,7 @@ import uuid from octue import Runner -from .base import BaseTestCase +from ..base import BaseTestCase class TemplateAppsTestCase(BaseTestCase): @@ -15,10 +15,11 @@ def setUp(self): super().setUp() self.start_path = os.getcwd() - # Initialise just so that pylint picks up these variables are present (reinitialised in set_template()) + # Initialise so these variables are assigned on the instance self.template_data_path = None self.template_twine = None self.template_path = None + self.app_test_path = None self.teardown_templates = [] def set_template(template): @@ -57,4 +58,22 @@ def test_fractal_configuration(self): twine=self.template_twine, configuration_values=os.path.join("data", "configuration", "configuration_values.json"), ) - runner.run(app_src=".") + analysis = runner.run( + app_src=self.template_path, output_manifest_path=os.path.join("data", "output", "manifest.json") + ) + analysis.finalise(output_dir=os.path.join("data", "output")) + + def test_using_manifests(self): + """ Ensures using-manifests app works correctly + """ + self.set_template("template-using-manifests") + runner = Runner( + 
twine=self.template_twine, configuration_values=os.path.join("data", "configuration", "values.json"),
+        )
+        analysis = runner.run(
+            app_src=self.template_path,
+            input_manifest=os.path.join("data", "input", "manifest.json"),
+            output_manifest_path=os.path.join("data", "output", "manifest.json"),
+        )
+        analysis.finalise(output_dir=os.path.join("data", "output"))
+        self.assertTrue(os.path.isfile(os.path.join("data", "output", "cleaned_met_mast_data", "cleaned.csv")))
diff --git a/tests/test_templates.py b/tests/templates/test_templates.py
similarity index 92%
rename from tests/test_templates.py
rename to tests/templates/test_templates.py
index ecba2b46b..b369f9cb4 100644
--- a/tests/test_templates.py
+++ b/tests/templates/test_templates.py
@@ -4,7 +4,7 @@
 
 from octue import exceptions
 from octue.templates import copy_template
-from .base import BaseTestCase
+from ..base import BaseTestCase
 
 
 class TemplatesTestCase(BaseTestCase):
@@ -42,7 +42,7 @@ def test_copy_templates_to_current_directory_by_default(self):
     def test_copy_template_raises_if_unknown(self):
         """ Ensures that a known template will copy to a given directory
         """
-        with TemporaryDirectory() as dir:
+        with TemporaryDirectory() as tmp_dir:
             with self.assertRaises(exceptions.InvalidInputException) as error:
-                copy_template("template-which-isnt-there", dir)
+                copy_template("template-which-isnt-there", tmp_dir)
         self.assertIn("Unknown template name 'template-which-isnt-there', try one of", error.exception.args[0])
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 000000000..4538e5798
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,54 @@
+import os
+from click.testing import CliRunner
+from tests import TESTS_DIR
+from tests.app import CUSTOM_APP_RUN_MESSAGE
+from tests.base import BaseTestCase
+
+from octue.cli import octue_cli
+
+
+class CliTestCase(BaseTestCase):
+
+    TWINE_FILE_PATH = os.path.join(TESTS_DIR, "data", "twines", "valid_schema_twine.json")
+
+    def test_version(self):
+        """Ensure the version option works in the CLI."""
+        result = CliRunner().invoke(octue_cli, ["--version"])
+        assert "version" in result.output
+
+    def test_help(self):
+        """Ensure the help options work in the CLI."""
+        help_result = CliRunner().invoke(octue_cli, ["--help"])
+        assert help_result.output.startswith("Usage")
+
+        h_result = CliRunner().invoke(octue_cli, ["-h"])
+        assert help_result.output == h_result.output
+
+    def test_run_command_can_be_added(self):
+        """Test that a custom app's run function is invoked by the CLI's run command."""
+        result = CliRunner().invoke(
+            octue_cli,
+            [
+                "run",
+                f"--app-dir={TESTS_DIR}",
+                f"--twine={self.TWINE_FILE_PATH}",
+                f'--config-dir={os.path.join(TESTS_DIR, "data", "data_dir_with_no_manifests", "configuration")}',
+                f'--input-dir={os.path.join(TESTS_DIR, "data", "data_dir_with_no_manifests", "input")}',
+            ],
+        )
+
+        assert CUSTOM_APP_RUN_MESSAGE in result.output
+
+    def test_run_command_works_with_data_dir(self):
+        """Test that the run command of the CLI works with the --data-dir option."""
+        result = CliRunner().invoke(
+            octue_cli,
+            [
+                "run",
+                f"--app-dir={TESTS_DIR}",
+                f"--twine={self.TWINE_FILE_PATH}",
+                f'--data-dir={os.path.join(TESTS_DIR, "data", "data_dir_with_no_manifests")}',
+            ],
+        )
+
+        assert CUSTOM_APP_RUN_MESSAGE in result.output
diff --git a/tests/test_datafile.py b/tests/test_datafile.py
deleted file mode 100644
index 5d8a02236..000000000
--- a/tests/test_datafile.py
+++ /dev/null
@@ -1,124 +0,0 @@
-import os
-import uuid
-
-from octue import exceptions
-from octue.resources import Datafile -from .base import BaseTestCase - - -class DatafileTestCase(BaseTestCase): - def test_instantiates(self): - """ Ensures a Datafile instantiates using only a path and generates a uuid ID - """ - df = Datafile(path="a_path") - self.assertTrue(isinstance(df.id, str)) - uuid.UUID(df.id) - self.assertIsNone(df.sequence) - self.assertEqual(0, df.cluster) - - def test_path_argument_required(self): - """ Ensures instantiation without a path will fail - """ - with self.assertRaises(exceptions.InvalidInputException) as error: - Datafile() - - self.assertIn("You must supply a valid 'path' argument", error.exception.args[0]) - - def test_default_local_path_prefix(self): - """ Ensures the local path, by default, is set to the current working directory - """ - df = Datafile(path="/path-within-dataset/a_test_file.csv", skip_checks=True) - self.assertEqual("path-within-dataset/a_test_file.csv", df.path) - self.assertEqual(str(os.getcwd()), df.local_path_prefix.rstrip("\\/")) - - def test_local_path_prefix_with_checks(self): - """ Ensures the local path can be specified and that checks pass on instantiation - """ - local_path = os.path.join(self.data_path, "basic_files/configuration/test-dataset") - df = Datafile(path="path-within-dataset/a_test_file.csv", local_path_prefix=local_path, skip_checks=False) - self.assertEqual("csv", df.extension) - - def test_leading_slashes_on_path_are_stripped(self): - """ Ensures the path is always relative - """ - df = Datafile(path="/path-within-dataset/a_test_file.csv", skip_checks=True) - self.assertEqual("path-within-dataset/a_test_file.csv", df.path) - - df = Datafile(path="path-within-dataset/a_test_file.csv", skip_checks=True) - self.assertEqual("path-within-dataset/a_test_file.csv", df.path) - - def test_checks_pass_when_file_exists(self): - local_path_prefix = os.path.join(self.data_path, "basic_files/configuration/test-dataset") - path = "path-within-dataset/a_test_file.csv" - df = Datafile(local_path_prefix=local_path_prefix, path=path, skip_checks=False) - self.assertEqual("path-within-dataset/a_test_file.csv", df.path) - self.assertEqual(os.path.join(local_path_prefix, path), df.full_path) - self.assertEqual("csv", df.extension) - - def test_checks_fail_when_file_doesnt_exist(self): - - path = "not_a_real_file.csv" - with self.assertRaises(exceptions.FileNotFoundException) as error: - Datafile(path=path, skip_checks=False) - self.assertIn("No file found at", error.exception.args[0]) - - def test_conflicting_extension_fails_check(self): - local_path_prefix = os.path.join(self.data_path, "basic_files/configuration/test-dataset") - path = "path-within-dataset/a_test_file.csv" - with self.assertRaises(exceptions.InvalidInputException) as error: - Datafile(local_path_prefix=local_path_prefix, path=path, skip_checks=False, extension="notcsv") - - self.assertIn("Extension provided (notcsv) does not match file extension", error.exception.args[0]) - - def test_file_attributes_accessible(self): - """ Ensures that its possible to set the sequence, cluster and timestamp - """ - local_path_prefix = os.path.join(self.data_path, "basic_files/configuration/test-dataset") - path = "path-within-dataset/a_test_file.csv" - df = Datafile(local_path_prefix=local_path_prefix, path=path, skip_checks=False) - self.assertIsInstance(df.size_bytes, int) - self.assertGreaterEqual(df.last_modified, 1598200190.5771205) - self.assertEqual("a_test_file.csv", df.name) - - df.sequence = 2 - df.cluster = 0 - df.posix_timestamp = 0 - - def 
test_cannot_set_calculated_file_attributes(self): - """ Ensures that calculated attributes cannot be set - """ - local_path_prefix = os.path.join(self.data_path, "basic_files/configuration/test-dataset") - path = "path-within-dataset/a_test_file.csv" - df = Datafile(local_path_prefix=local_path_prefix, path=path, skip_checks=False) - - with self.assertRaises(AttributeError): - df.size_bytes = 1 - - with self.assertRaises(AttributeError): - df.last_modified = 1000000000.5771205 - - def test_serialisable(self): - """ Ensures a datafile can serialise to json format - """ - local_path_prefix = os.path.join(self.data_path, "basic_files/configuration/test-dataset") - path = "path-within-dataset/a_test_file.csv" - df = Datafile(local_path_prefix=local_path_prefix, path=path, skip_checks=False) - df_dict = df.serialise() - - for k in df_dict.keys(): - self.assertFalse(k.startswith("_")) - - for k in ( - "cluster", - "extension", - "id", - "last_modified", - "name", - "path", - "posix_timestamp", - "sequence", - "size_bytes", - "tags", - "sha_256", - ): - self.assertIn(k, df_dict.keys()) diff --git a/tests/test_runner.py b/tests/test_runner.py index 22d1407f6..5531ef068 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -76,7 +76,7 @@ def fcn(analysis): pass with self.assertRaises(twined.exceptions.InvalidValuesContents) as error: - runner.run(fcn) + runner.run(fcn).finalise() self.assertIn("'n_iterations' is a required property", error.exception.args[0]) @@ -84,20 +84,7 @@ def fcn(analysis): def fcn(analysis): analysis.output_values["n_iterations"] = 10 - runner.run(fcn) - - def test_exception_raised_when_extra_strand_data_present(self): - """ Ensures that protected attributes can't be set - """ - with self.assertRaises(twined.exceptions.StrandNotFound) as error: - Runner( - twine="{}", configuration_values={}, - ) - - self.assertIn( - "Source data is provided for 'configuration_values' but no such strand is defined in the twine", - error.exception.args[0], - ) + runner.run(fcn).finalise() def test_exception_raised_when_strand_data_missing(self): """ Ensures that protected attributes can't be set diff --git a/tox.ini b/tox.ini index d9ed668d8..03acdf62a 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = {py36,py37,p38},py36-flake8 +envlist = {py36,py37,py38},py36-flake8 [testenv] setenv =