diff --git a/.github/actions/e2e/action.yml b/.github/actions/e2e/action.yml new file mode 100644 index 0000000..eec373c --- /dev/null +++ b/.github/actions/e2e/action.yml @@ -0,0 +1,26 @@ +--- +name: Test E2E +description: Run e2e tests +inputs: + working_directory: + description: "Working directory" + required: false + default: . +runs: + using: "composite" + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + repository: neptune-ai/neptune-mlflow + path: ${{ inputs.working_directory }} + + - name: Install dependencies + working-directory: ${{ inputs.working_directory }} + run: pip install -e .[dev] + shell: bash + + - name: Run tests + working-directory: ${{ inputs.working_directory }} + run: pytest -v tests/e2e + shell: bash diff --git a/.github/actions/unit/action.yml b/.github/actions/unit/action.yml new file mode 100644 index 0000000..4fe4520 --- /dev/null +++ b/.github/actions/unit/action.yml @@ -0,0 +1,26 @@ +--- +name: Unit test +description: Run unit tests +inputs: + working_directory: + description: "Working directory" + required: false + default: . +runs: + using: "composite" + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + repository: neptune-ai/neptune-mlflow + path: ${{ inputs.working_directory }} + + - name: Install dependencies + working-directory: ${{ inputs.working_directory }} + run: pip install -e .[dev] + shell: bash + + - name: Run tests + working-directory: ${{ inputs.working_directory }} + run: pytest -v tests/unit + shell: bash diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..adaffd7 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,79 @@ +name: neptune-mlflow +on: [push] +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - name: Install dependencies + run: | + pip install -e .[dev] + + - name: Pre-commit + run: | + pre-commit run --show-diff-on-failure --color=always + + e2e: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: [3.9] + steps: + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Test + env: + NEPTUNE_API_TOKEN: ${{ secrets.E2E_NEPTUNE_API_TOKEN }} + NEPTUNE_PROJECT: e2e-tests/e2e + uses: ./.github/actions/e2e + + unit: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ ubuntu-latest, macos-latest, windows-latest ] + python-version: [ 3.9 ] + steps: + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Test + uses: ./.github/actions/unit + + publish: + needs: [pre-commit, e2e, unit] + runs-on: ubuntu-latest + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') + steps: + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - name: Install build dependencies + run: pip install poetry poetry-dynamic-versioning + + - name: Build package + run : | + poetry build + + - name: Publish package + uses: pypa/gh-action-pypi-publish@v1.5.1 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} + packages_dir: dist diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml deleted file mode 100644 index 9c0cc55..0000000 --- a/.github/workflows/pre-commit.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: pre-commit - -on: push - -jobs: - pre-commit: - 
runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - - name: Install dependencies - run: | - pip install -e .[dev] - - name: pre-commit run - uses: pre-commit/action@v3.0.0 diff --git a/.gitignore b/.gitignore index 5147813..22b1db1 100644 --- a/.gitignore +++ b/.gitignore @@ -107,3 +107,10 @@ venv.bak/ # Pycharm .idea/ + +# MLflow +mlruns/ +test_tracking_uri/ + +# Neptune +.neptune/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2912e6a..9e40089 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: - id: end-of-file-fixer - id: trailing-whitespace - repo: https://github.com/pycqa/isort - rev: 5.10.1 + rev: 5.12.0 hooks: - id: isort args: [--settings-path, pyproject.toml] diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 708f440..0000000 --- a/.travis.yml +++ /dev/null @@ -1,20 +0,0 @@ -sudo: false -language: python -python: -- '2.7' -- '3.5' -- '3.6' -install: -- make prepare -- pip install tox-travis -script: -- make tests -deploy: - distributions: git_version sdist - provider: pypi - user: neptune-deployer - password: - secure: Ss31M/pvWgwy3Gjeg2/c/KJqw/Ma5ofmIRDg4BB5zgF73bVxLbdKl8tmBnn+sRQywHzoxMNsMVSVJveUxkHwAgpqGUbb7KEYyZUsmBzkISi5lCTcTWqr2LxUtKk3upaZns54RurHLeyIjXf2ttSBsBYbj0FDaQeD+C7uzCnKZCOYridu+0nD39CghdR1+yVQY/4M1viglASa9sQEwgEBeGqiBWoJNeWX1KOMpWkRnzgMSv2w1AY5Lmq+vWK7NBm7Xdk15kVofdAB/zQdtFtkyaxN9gaEhoXLPU50oxj+UQXwb7AyhFg4fxlUyzz1GpeYFPgHeJlIADrKAyQzzd4vhAJJO0Gv1z4hAjJ/tTFIK4TChdv4FCDi6UGinWbq6pZRIuz540hLV9VMdxeYKcUvGO14dHCtIQ0CD3NK9iqUmV07pAUlyiKWBgQbEISFUkZ63/kr0d8cm3JQfG+fxxg6GAIqCFKfD5Gmy/c1qZNf9Oy0IjEuK8ywRbEPo3t3aiJf2u5E3LFSNjWuqZWpRTIBoFtaq7jkecpDWz6j6FKKM/zy8CJvOfbmFQGu4el3Z5RWDdXjT1nB49waGvbItFiFOR0cOT7gDSdsbOvOFExuefJ8m7uOQB9oAqo4e06yNxLXpQSFBBnHLzoBb8/VfA2PXNAeazpls3U8uKJsjP9LAys= - on: - tags: true - skip_existing: true diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..c89089b --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,4 @@ +## [UNRELEASED] neptune-mlflow 1.0.0 + +### Changes +- Updated MLflow exporter to work with the current neptune.ai API ([#40](https://github.com/neptune-ai/neptune-mlflow/pull/40)) diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index d3490a2..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include requirements.txt -include VERSION -include git_version.py diff --git a/Makefile b/Makefile deleted file mode 100644 index 62dba31..0000000 --- a/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -clean: - rm -fr .tox/ dist/ VERSION - -prepare: - pip install -r requirements.txt -r test_requirements.txt - -build: - python setup.py git_version sdist - -tests: unit_tests - -unit_tests: - tox diff --git a/docs_requirements.txt b/docs_requirements.txt deleted file mode 100644 index 81dcf43..0000000 --- a/docs_requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -sphinx-rtd-theme==0.4.3 -sphinx==1.8.5 -nbsphinx==0.4.2 diff --git a/git_version.py b/git_version.py deleted file mode 100644 index 81a818e..0000000 --- a/git_version.py +++ /dev/null @@ -1,58 +0,0 @@ -# Original source: https://github.com/Changaco/version.py -import os -import re -from distutils.cmd import Command -from subprocess import ( - CalledProcessError, - call, - check_output, -) - -PREFIX = "" -INITIAL_VERSION = "0.0.0" - -tag_re = re.compile(r"\btag: %s([0-9][^,]*)\b" % PREFIX) -version_re = re.compile("^Version: (.+)$", re.M) - - -def get_git_version(): - if "VERSION" in os.environ: - return 
os.environ.get("VERSION") - - # Return the version if it has been injected into the file by git-archive - version = tag_re.search("$Format:%D$") - if version: - return version.group(1) - - # Get the version using "git describe". - try: - cmd = "git describe --tags --match %s[0-9.]* --dirty" % PREFIX - version = check_output(cmd.split()).decode().strip()[len(PREFIX) :] - except CalledProcessError: - version = INITIAL_VERSION + "+" + check_output("git rev-parse HEAD".split()).decode().strip()[:10] - if call("git diff --quiet".split()) != 0: - version += ".dirty" - - # PEP 440 compatibility - if "-" in version: - version_tokens = version.split("-") - version = version_tokens[0] + "+" + ".".join(version_tokens[1:]) - - return version - - -class GitVersion(Command): - user_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - def run(self): - version = get_git_version() - self.distribution.metadata.version = version - - with open("VERSION", "w") as version_file: - version_file.write(version) diff --git a/pyproject.toml b/pyproject.toml index 82fc62a..5c3d8eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,77 @@ +[build-system] +requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"] +build-backend = "poetry_dynamic_versioning.backend" + +[tool.poetry-dynamic-versioning] +enable = true +vcs = "git" +style = "semver" +pattern = "default-unprefixed" + +[tool.poetry.dependencies] +python = "^3.7" + +# Python lack of functionalities from future versions +importlib-metadata = { version = "*", python = "<3.8" } + +mlflow = ">=1.24.0" +# dev +pre-commit = { version = "*", optional = true } +pytest = { version = ">=5.0", optional = true } +pytest-cov = { version = "2.10.1", optional = true } +neptune = { version = ">=1.0.0", optional = true } +tensorflow = { version = ">2.0.0", optional = true } + +[tool.poetry.extras] +dev = [ + "pre-commit", + "pytest", + "pytest-cov", + "neptune", + "tensorflow", +] + +[tool.poetry] +authors = ["neptune.ai "] +description = "neptune.ai MLflow integration library" +repository = "https://github.com/neptune-ai/neptune-mlflow" +homepage = "https://neptune.ai/" +documentation = "https://docs.neptune.ai/integrations/mlflow/" +include = ["CHANGELOG.md"] +license = "Apache License 2.0" +name = "neptune-mlflow" +readme = "README.md" +version = "0.0.0" +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Natural Language :: English", + "Operating System :: MacOS", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Operating System :: Unix", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: Python :: Implementation :: CPython", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] +keywords = [ + "MLOps", + "ML Experiment Tracking", + "ML Model Registry", + "ML Model Store", + "ML Metadata Store", +] +packages = [ + { include = "neptune_mlflow_exporter", from = "src" }, +] + +[tool.poetry.urls] +"Tracker" = "https://github.com/neptune-ai/neptune-mlflow/issues" +"Documentation" = "https://docs.neptune.ai/integrations/mlflow/" + [tool.black] line-length = 120 target-version = ['py37', 'py38', 'py39', 'py310'] @@ -21,3 +95,6 @@ force_grid_wrap = 2 [tool.flake8] max-line-length = 120 extend-ignore = "E203" + +[tool.poetry.plugins."neptune.plugins"] +"mlflow" = 
"neptune_mlflow_exporter:sync" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 0abe348..0000000 --- a/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -click>=7.0 -future>=0.17.1 -mlflow>=1.0.0 -neptune-client>=0.2.1 -pandas -path.py diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 5e40900..0000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[wheel] -universal = 1 diff --git a/setup.py b/setup.py deleted file mode 100644 index 60b5168..0000000 --- a/setup.py +++ /dev/null @@ -1,82 +0,0 @@ -# -# Copyright (c) 2020, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os - -from setuptools import ( - find_packages, - setup, -) - -import git_version - - -def version(): - try: - with open("VERSION") as f: - return f.readline().strip() - except IOError: - return "0.0.0" - - -def main(): - root_dir = os.path.dirname(__file__) - - with open(os.path.join(root_dir, "requirements.txt")) as f: - requirements = [r.strip() for r in f] - setup( - name="neptune-mlflow", - version=version(), - url="https://github.com/neptune-ai/neptune-mlflow", - license="Apache License 2.0", - author="neptune.ai", - author_email="contact@neptune.ai", - description="Neptune MLFlow", - long_description=__doc__, - packages=find_packages(where="src"), - platforms="any", - install_requires=requirements, - entry_points={"neptune.plugins": "mlflow = neptune_mlflow_plugin:sync"}, - cmdclass={ - "git_version": git_version.GitVersion, - }, - classifiers=[ - # As from http://pypi.python.org/pypi?%3Aaction=list_classifiers - # 'Development Status :: 1 - Planning', - # 'Development Status :: 2 - Pre-Alpha', - # 'Development Status :: 3 - Alpha', - "Development Status :: 4 - Beta", - # 'Development Status :: 5 - Production/Stable', - # 'Development Status :: 6 - Mature', - # 'Development Status :: 7 - Inactive', - "Environment :: Console", - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "Operating System :: POSIX", - "Operating System :: MacOS", - "Operating System :: Unix", - "Operating System :: Microsoft :: Windows", - "Programming Language :: Python", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 3", - "Topic :: Software Development :: Libraries :: Python Modules", - ], - package_dir={"": "src"}, - ) - - -if __name__ == "__main__": - main() diff --git a/src/neptune_mlflow/data_loader.py b/src/neptune_mlflow/data_loader.py deleted file mode 100644 index 8baa287..0000000 --- a/src/neptune_mlflow/data_loader.py +++ /dev/null @@ -1,144 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import print_function - -import os -import re -from urllib.parse import urlparse -from urllib.request import url2pathname - -import click -import mlflow -import path as path_utils - - -class DataLoader(object): - - MLFLOW_EXPERIMENT_ID_PROPERTY = "mlflow/experiment/id" - MLFLOW_EXPERIMENT_NAME_PROPERTY = "mlflow/experiment/name" - MLFLOW_RUN_ID_PROPERTY = "mlflow/run/uuid" - MLFLOW_RUN_NAME_PROPERTY = "mlflow/run/name" - - def __init__(self, project, path): - self._project = project - self._path = path - - def run(self): - with path_utils.Path(self._path): - mlflow_client = mlflow.tracking.MlflowClient() - experiments = mlflow_client.list_experiments() - - for experiment in experiments: - run_infos = mlflow_client.list_run_infos(experiment_id=experiment.experiment_id) - existing_experiments = self._project.get_experiments(tag=DataLoader._to_proper_tag(experiment.name)) - existing_run_uuids = set( - [str(e.get_properties().get(DataLoader.MLFLOW_RUN_ID_PROPERTY)) for e in existing_experiments] - ) - - for run_info in run_infos: - run_qualified_name = self._get_run_qualified_name(experiment, run_info) - if run_info.run_uuid not in existing_run_uuids: - click.echo("Loading run {}".format(run_qualified_name)) - run = mlflow_client.get_run(run_info.run_uuid) - exp_uuid = self._create_neptune_experiment(experiment, run) - click.echo("Run {} was saved as {}".format(run_qualified_name, exp_uuid)) - else: - click.echo("Ignoring run {} since it already exists".format(run_qualified_name)) - - def _create_neptune_experiment(self, experiment, run): - with self._project.create_experiment( - name=self._get_name_for_experiment(experiment), - params=self._get_params(run), - properties=self._get_properties(experiment, run), - tags=self._get_tags(experiment, run), - upload_source_files=[], - abort_callback=lambda *args: None, - upload_stdout=False, - upload_stderr=False, - send_hardware_metrics=False, - run_monitoring_thread=False, - handle_uncaught_exceptions=True, - ) as neptune_exp: - if run.info.artifact_uri.startswith("file:/"): - artifacts_path = url2pathname(urlparse(run.info.artifact_uri).path) - with path_utils.Path(artifacts_path): - for artifact in os.listdir(artifacts_path): - neptune_exp.send_artifact(artifact) - else: - click.echo( - "WARNING: Remote artifacts are not supported and won't be uploaded (artifact_uri: {}).".format( - run.info.artifact_uri - ) - ) - - for metric_key in run.data.metrics.keys(): - self._create_metric(neptune_exp, experiment, run, metric_key) - - return neptune_exp.id - - @staticmethod - def _create_metric(neptune_exp, experiment, run, metric_key): - with open(DataLoader._get_metric_file(experiment, run.info, metric_key)) as f: - for idx, line in enumerate(f, start=1): - value = float(line.split()[1]) - neptune_exp.send_metric(metric_key, idx, value) - - @staticmethod - def _get_params(run): - params = {} - for key, value in run.data.params.items(): - params[key] = value - return params - - @staticmethod - def _get_properties(experiment, run): - properties = { - DataLoader.MLFLOW_EXPERIMENT_ID_PROPERTY: str(experiment.experiment_id), - 
DataLoader.MLFLOW_EXPERIMENT_NAME_PROPERTY: experiment.name, - DataLoader.MLFLOW_RUN_ID_PROPERTY: run.info.run_uuid, - DataLoader.MLFLOW_RUN_NAME_PROPERTY: DataLoader._get_mlflow_run_name(run) or "", - } - for key, value in run.data.tags.items(): - properties[key] = value - return properties - - @staticmethod - def _get_tags(experiment, run): - tags = [DataLoader._to_proper_tag(experiment.name), "mlflow"] - if DataLoader._get_mlflow_run_name(run): - tags.append(DataLoader._to_proper_tag(DataLoader._get_mlflow_run_name(run))) - return tags - - @staticmethod - def _to_proper_tag(string): - return re.sub("[^a-zA-Z0-9\\-_]", "_", string).lower() - - @staticmethod - def _get_metric_file(experiment, run_info, metric_key): - return "mlruns/{}/{}/metrics/{}".format(experiment.experiment_id, run_info.run_uuid, metric_key) - - @staticmethod - def _get_name_for_experiment(experiment): - return experiment.name or "experiment-{}".format(experiment.experiment_id) - - @staticmethod - def _get_run_qualified_name(experiment, run_info): - exp_name = DataLoader._get_name_for_experiment(experiment) - return "{}/{}".format(exp_name, run_info.run_id) - - @staticmethod - def _get_mlflow_run_name(run): - return run.data.tags.get("mlflow.runName", None) diff --git a/src/neptune_mlflow/sync.py b/src/neptune_mlflow/sync.py deleted file mode 100644 index bb9340b..0000000 --- a/src/neptune_mlflow/sync.py +++ /dev/null @@ -1,45 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import sys - -import click -import neptune - -from neptune_mlflow.data_loader import DataLoader - - -def sync(path, project): - if path is None: - path = "." - - if not os.path.exists(path): - click.echo("ERROR: Directory `{}` doesn't exist".format(path), err=True) - sys.exit(1) - - if not os.path.isdir(path): - click.echo("ERROR: `{}` is not a directory".format(path), err=True) - sys.exit(1) - - if not os.path.exists(os.path.join(path, "mlruns")): - click.echo("ERROR: No 'mlruns' directory in {}".format(path), err=True) - sys.exit(1) - - project = neptune.init(project_qualified_name=project) - - loader = DataLoader(project, path) - loader.run() diff --git a/src/neptune_mlflow_exporter/__init__.py b/src/neptune_mlflow_exporter/__init__.py new file mode 100644 index 0000000..94eb5da --- /dev/null +++ b/src/neptune_mlflow_exporter/__init__.py @@ -0,0 +1,73 @@ +# +# Copyright (c) 2023, Neptune Labs Sp. z o.o. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from typing import Optional + +import click + + +@click.command("mlflow") +@click.option("--project", "-p", help="Neptune project name", required=False, type=str) +@click.option("--api-token", "-a", help="Your Neptune API Key", required=False, type=str) +@click.option("--mlflow-tracking-uri", "-u", help="Your MLflow tracking URI", required=False, type=str) +@click.option( + "--exclude-artifacts", + "-e", + help="Specifies whether to also include artifacts in the upload", + required=False, + default=False, + type=bool, +) +@click.option( + "--max-artifact-size", + "-m", + help="Maximum size uploaded to Neptune, unit is in MB", + required=False, + default=50, + type=int, +) +def sync( + *, + project: Optional[str], + api_token: Optional[str], + mlflow_tracking_uri: Optional[str], + exclude_artifacts: bool, + max_artifact_size: int, +) -> None: + """Exports MLflow runs to neptune.ai. + + Args: + project: name of the Neptune project where your MLflow runs will go. + If not provided, NEPTUNE_PROJECT env variable will be used. + api_token: your Neptune API token. + If not provided, NEPTUNE_API_TOKEN env variable will be used. + mlflow_tracking_uri: your MLflow tracking URI. + If not provided, it is left to the MLflow client to resolve it. + exclude_artifacts: whether to skip uploading artifacts to Neptune. + max_artifact_size: max size of the artifact to be uploaded to Neptune. + Unit is in MB. + For directories this will be treated as the max size of the entire directory. + """ + + # We do not want to import anything if process was executed for autocompletion purposes. + from neptune_mlflow_exporter.impl.sync import sync as run_sync + + run_sync( + project_name=project, + api_token=api_token, + mlflow_tracking_uri=mlflow_tracking_uri, + exclude_artifacts=exclude_artifacts, + max_artifact_size=max_artifact_size, + ) diff --git a/tests/neptune_mlflow/__init__.py b/src/neptune_mlflow_exporter/impl/__init__.py similarity index 70% rename from tests/neptune_mlflow/__init__.py rename to src/neptune_mlflow_exporter/impl/__init__.py index 62a86a5..44fe730 100644 --- a/tests/neptune_mlflow/__init__.py +++ b/src/neptune_mlflow_exporter/impl/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, Neptune Labs Sp. z o.o. +# Copyright (c) 2023, Neptune Labs Sp. z o.o. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,3 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # + +__all__ = ["NeptuneExporter", "__version__"] + +from neptune_mlflow_exporter.impl.neptune_exporter import NeptuneExporter +from neptune_mlflow_exporter.impl.version import __version__ diff --git a/src/neptune_mlflow_exporter/impl/artifact_strategy.py b/src/neptune_mlflow_exporter/impl/artifact_strategy.py new file mode 100644 index 0000000..53bf9b7 --- /dev/null +++ b/src/neptune_mlflow_exporter/impl/artifact_strategy.py @@ -0,0 +1,112 @@ +# +# Copyright (c) 2023, Neptune Labs Sp. z o.o. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +__all__ = [ + "FileUploadStrategy", + "DirectoryUploadStrategy", + "choose_upload_strategy", +] + +import contextlib +import os +import tempfile +from abc import ( + ABC, + abstractmethod, +) +from pathlib import Path +from typing import TYPE_CHECKING + +from mlflow.artifacts import download_artifacts +from mlflow.entities import FileInfo +from mlflow.entities import Run as MlflowRun + +if TYPE_CHECKING: + try: + from neptune import Run + except ImportError: + from neptune.new import Run + + +def get_dir_size(path: str) -> int: + directory = Path(path) + return sum(f.stat().st_size for f in directory.glob("**/*") if f.is_file()) + + +@contextlib.contextmanager +def preserve_cwd(path): + cwd = os.getcwd() + os.chdir(path) + try: + yield + finally: + os.chdir(cwd) + + +class ArtifactUploadStrategy(ABC): + BASE_NAMESPACE = "artifacts" + + def __init__( + self, + *, + tracking_uri: str, + max_file_size: int, + ): + + self._uri = tracking_uri + self._max_size = max_file_size + + @abstractmethod + def upload_to_neptune(self, neptune_run: "Run", info: FileInfo) -> None: + ... + + def upload_artifact(self, neptune_run: "Run", info: FileInfo, mlflow_run: MlflowRun) -> None: + with tempfile.TemporaryDirectory() as tmpdirname: + with preserve_cwd(tmpdirname): + if not info.is_dir and info.file_size > self._max_size: + return + + artifact_uri = mlflow_run.info.artifact_uri + download_artifacts(artifact_uri=artifact_uri + "/" + info.path, dst_path=tmpdirname) + + if info.is_dir and get_dir_size(tmpdirname) > self._max_size: + return + + self.upload_to_neptune(neptune_run, info) + + +class FileUploadStrategy(ArtifactUploadStrategy): + def upload_to_neptune(self, neptune_run: "Run", info: FileInfo) -> None: + neptune_run[f"{self.BASE_NAMESPACE}/{info.path}"].upload(info.path, wait=True) + + +class DirectoryUploadStrategy(ArtifactUploadStrategy): + def upload_to_neptune(self, neptune_run: "Run", info: FileInfo) -> None: + dir_path = str(Path(info.path) / "*") + neptune_run[f"{self.BASE_NAMESPACE}/{info.path}"].upload_files(dir_path, wait=True) + + +def choose_upload_strategy(artifact: FileInfo, tracking_uri: str, max_artifact_size: int) -> ArtifactUploadStrategy: + if artifact.is_dir: + return DirectoryUploadStrategy( + tracking_uri=tracking_uri, + max_file_size=max_artifact_size, + ) + else: + return FileUploadStrategy( + tracking_uri=tracking_uri, + max_file_size=max_artifact_size, + ) diff --git a/tests/neptune_mlflow_plugin/__init__.py b/src/neptune_mlflow_exporter/impl/components/__init__.py similarity index 63% rename from tests/neptune_mlflow_plugin/__init__.py rename to src/neptune_mlflow_exporter/impl/components/__init__.py index 62a86a5..3ffc88b 100644 --- a/tests/neptune_mlflow_plugin/__init__.py +++ b/src/neptune_mlflow_exporter/impl/components/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, Neptune Labs Sp. z o.o. +# Copyright (c) 2023, Neptune Labs Sp. z o.o. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,3 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# + +__all__ = [ + "Fetcher", + "Exporter", + "ExportConfig", +] + +from neptune_mlflow_exporter.impl.components.config import ExportConfig +from neptune_mlflow_exporter.impl.components.exporter import Exporter +from neptune_mlflow_exporter.impl.components.fetcher import Fetcher diff --git a/src/neptune_mlflow_exporter/impl/components/config.py b/src/neptune_mlflow_exporter/impl/components/config.py new file mode 100644 index 0000000..7802f59 --- /dev/null +++ b/src/neptune_mlflow_exporter/impl/components/config.py @@ -0,0 +1,29 @@ +# +# Copyright (c) 2023, Neptune Labs Sp. z o.o. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +__all__ = ["ExportConfig"] + +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class ExportConfig: + exclude_artifacts: bool + max_artifact_size: int + project_name: Optional[str] + api_token: Optional[str] + mlflow_tracking_uri: Optional[str] diff --git a/src/neptune_mlflow_exporter/impl/components/exporter.py b/src/neptune_mlflow_exporter/impl/components/exporter.py new file mode 100644 index 0000000..986e7ff --- /dev/null +++ b/src/neptune_mlflow_exporter/impl/components/exporter.py @@ -0,0 +1,83 @@ +# +# Copyright (c) 2023, Neptune Labs Sp. z o.o. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +__all__ = ["Exporter"] + +from datetime import datetime +from typing import Optional + +import mlflow +from mlflow.entities import Experiment +from mlflow.entities import Run as MlflowRun + +from neptune_mlflow_exporter.impl.artifact_strategy import choose_upload_strategy + +try: + from neptune import Run as NeptuneRun +except ImportError: + from neptune.new import Run as NeptuneRun + + +class Exporter: + def __init__(self, client: mlflow.tracking.MlflowClient): + self.mlflow_client = client + + @staticmethod + def export_experiment_metadata(neptune_run: NeptuneRun, experiment: Experiment) -> None: + neptune_run["experiment/experiment_id"] = experiment.experiment_id + neptune_run["experiment/tags"] = experiment.tags + neptune_run["experiment/name"] = experiment.name + + # https://stackoverflow.com/questions/9744775/how-to-convert-integer-timestamp-into-a-datetime + neptune_run["experiment/creation_time"] = datetime.fromtimestamp(experiment.creation_time / 1e3) + neptune_run["experiment/last_update_time"] = datetime.fromtimestamp(experiment.last_update_time / 1e3) + + @staticmethod + def export_run_info(neptune_run: NeptuneRun, mlflow_run: MlflowRun) -> None: + info = dict(mlflow_run.info) + + info["start_time"] = datetime.fromtimestamp(mlflow_run.info.start_time / 1e3) + info["end_time"] = datetime.fromtimestamp(mlflow_run.info.end_time / 1e3) + + neptune_run["run_info"] = info + + def export_run_data(self, neptune_run: NeptuneRun, mlflow_run: MlflowRun) -> None: + data_dict = mlflow_run.data.to_dictionary() + metric_keys = data_dict["metrics"].keys() + del data_dict["metrics"] + + neptune_run["run_data"] = data_dict + + for key in metric_keys: + metrics = self.mlflow_client.get_metric_history( + run_id=mlflow_run.info.run_id, + key=key, + ) + metric_values = list(map(lambda metric: metric.value, metrics)) + metric_timestamps = list(map(lambda metric: metric.timestamp / 1e3, metrics)) + metric_steps = list(map(lambda metric: metric.step, metrics)) + + neptune_run[f"run_data/metrics/{key}"].extend( + metric_values, steps=metric_steps, timestamps=metric_timestamps + ) + + def export_artifacts( + self, neptune_run: NeptuneRun, mlflow_run: MlflowRun, max_artifact_size: int, tracking_uri: Optional[str] + ) -> None: + for artifact in self.mlflow_client.list_artifacts(run_id=mlflow_run.info.run_id): + strategy = choose_upload_strategy(artifact, tracking_uri, max_artifact_size) + + strategy.upload_artifact(neptune_run, artifact, mlflow_run) diff --git a/src/neptune_mlflow_exporter/impl/components/fetcher.py b/src/neptune_mlflow_exporter/impl/components/fetcher.py new file mode 100644 index 0000000..1bc9a44 --- /dev/null +++ b/src/neptune_mlflow_exporter/impl/components/fetcher.py @@ -0,0 +1,105 @@ +# +# Copyright (c) 2023, Neptune Labs Sp. z o.o. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +__all__ = ["Fetcher"] + +from collections.abc import MutableMapping +from dataclasses import dataclass +from typing import ( + List, + Set, +) + +import mlflow +from mlflow.entities import Experiment +from mlflow.entities import Run as MlflowRun +from mlflow.entities import ViewType +from neptune import Project + + +@dataclass +class FetchedData: + mlflow_experiments: MutableMapping[str, Experiment] + mlflow_runs: List[MlflowRun] + neptune_run_ids: Set[str] + + +class Fetcher: + def __init__(self, project: Project, client: mlflow.tracking.MlflowClient): + self.project = project + self.mlflow_client = client + + def get_all_mlflow_experiments(self) -> MutableMapping[str, Experiment]: + page_limit = 100 + all_experiments = [] + page_token = None + + experiment_mapping = {} + + while not all_experiments or page_token: + experiments = self.mlflow_client.search_experiments( + max_results=page_limit, page_token=page_token, view_type=ViewType.ACTIVE_ONLY + ) + + all_experiments.extend(experiments) + page_token = experiments.token + + for exp in all_experiments: + experiment_mapping[exp.experiment_id] = exp + + return experiment_mapping + + def get_all_mlflow_runs(self, experiment_ids: List[str]) -> List[MlflowRun]: + page_limit = 100 + all_runs = [] + page_token = None + + while not all_runs or page_token: + runs = self.mlflow_client.search_runs( + experiment_ids=experiment_ids, run_view_type=ViewType.ALL, max_results=page_limit, page_token=page_token + ) + + all_runs.extend(runs) + + page_token = runs.token + + return all_runs + + def get_existing_neptune_run_ids(self) -> Set[str]: + try: + existing_neptune_run_ids = { + run_id for run_id in self.project.fetch_runs_table().to_pandas()["sys/custom_run_id"].to_list() + } + except KeyError: + # empty project + existing_neptune_run_ids = set() + + return existing_neptune_run_ids + + def fetch_data(self) -> FetchedData: + experiments = self.get_all_mlflow_experiments() + + experiment_ids = list(experiments.keys()) + + mlflow_runs = self.get_all_mlflow_runs(experiment_ids) + + neptune_run_ids = self.get_existing_neptune_run_ids() + + return FetchedData( + mlflow_experiments=experiments, + mlflow_runs=mlflow_runs, + neptune_run_ids=neptune_run_ids, + ) diff --git a/src/neptune_mlflow_exporter/impl/neptune_exporter.py b/src/neptune_mlflow_exporter/impl/neptune_exporter.py new file mode 100644 index 0000000..aae17e3 --- /dev/null +++ b/src/neptune_mlflow_exporter/impl/neptune_exporter.py @@ -0,0 +1,63 @@ +# +# Copyright (c) 2023, Neptune Labs Sp. z o.o. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from typing import Optional + +import mlflow + +try: + from neptune import Project +except ImportError: + from neptune.new.metadata_containers import Project + +from neptune_mlflow_exporter.impl.components import ( + ExportConfig, + Exporter, + Fetcher, +) +from neptune_mlflow_exporter.impl.orchestrator import ExportOrchestrator + + +class NeptuneExporter: + def __init__( + self, + project: Project, + *, + project_name: Optional[str] = None, + api_token: Optional[str] = None, + mlflow_tracking_uri: Optional[str] = None, + exclude_artifacts: bool = False, + max_artifact_size: int = 50, + ): + self.project = project + self.project_name = project_name + self.api_token = api_token + self.mlflow_tracking_uri = mlflow_tracking_uri + self.exclude_artifacts = exclude_artifacts + self.max_artifact_size = int(max_artifact_size * (1024 * 1024)) # to bytes + self.mlflow_client = mlflow.tracking.MlflowClient(tracking_uri=self.mlflow_tracking_uri) + + def run(self) -> None: + ExportOrchestrator( + fetcher=Fetcher(self.project, self.mlflow_client), + exporter=Exporter(self.mlflow_client), + config=ExportConfig( + exclude_artifacts=self.exclude_artifacts, + max_artifact_size=self.max_artifact_size, + project_name=self.project_name, + api_token=self.api_token, + mlflow_tracking_uri=self.mlflow_tracking_uri, + ), + ).run() diff --git a/src/neptune_mlflow_exporter/impl/orchestrator.py b/src/neptune_mlflow_exporter/impl/orchestrator.py new file mode 100644 index 0000000..ffe0b2c --- /dev/null +++ b/src/neptune_mlflow_exporter/impl/orchestrator.py @@ -0,0 +1,67 @@ +# +# Copyright (c) 2023, Neptune Labs Sp. z o.o. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +__all__ = ["ExportOrchestrator"] + +import click + +try: + from neptune import Run as NeptuneRun +except ImportError: + from neptune.new import Run as NeptuneRun + +from neptune_mlflow_exporter.impl.components import ( + ExportConfig, + Exporter, + Fetcher, +) + + +class ExportOrchestrator: + def __init__(self, fetcher: Fetcher, exporter: Exporter, config: ExportConfig): + + self.fetcher = fetcher + self.exporter = exporter + self.config = config + + def run(self) -> None: + fetched_data = self.fetcher.fetch_data() + + for mlflow_run in fetched_data.mlflow_runs: + if mlflow_run.info.run_id in fetched_data.neptune_run_ids: + click.echo(f"Ignoring mlflow_run '{mlflow_run.info.run_name}' since it already exists") + continue + + click.echo(f"Loading mlflow_run '{mlflow_run.info.run_name}'") + + with NeptuneRun( + project=self.config.project_name, api_token=self.config.api_token, custom_run_id=mlflow_run.info.run_id + ) as neptune_run: + try: + experiment = fetched_data.mlflow_experiments[mlflow_run.info.experiment_id] + self.exporter.export_experiment_metadata(neptune_run, experiment) + + self.exporter.export_run_info(neptune_run, mlflow_run) + self.exporter.export_run_data(neptune_run, mlflow_run) + + if not self.config.exclude_artifacts: + self.exporter.export_artifacts( + neptune_run, mlflow_run, self.config.max_artifact_size, self.config.mlflow_tracking_uri + ) + + click.echo(f"Run '{mlflow_run.info.run_name}' was saved") + except Exception as e: + click.echo(f"Error exporting run '{mlflow_run.info.run_name}': {e}") diff --git a/src/neptune_mlflow_exporter/impl/sync.py b/src/neptune_mlflow_exporter/impl/sync.py new file mode 100644 index 0000000..0b34bc0 --- /dev/null +++ b/src/neptune_mlflow_exporter/impl/sync.py @@ -0,0 +1,49 @@ +# +# Copyright (c) 2023, Neptune Labs Sp. z o.o. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from typing import Optional + +try: + from neptune import init_project + from neptune.integrations.utils import verify_type +except ImportError: + from neptune.new import init_project + from neptune.new.integrations.utils import verify_type + +from neptune_mlflow_exporter.impl import NeptuneExporter + + +def sync( + project_name: Optional[str] = None, + api_token: Optional[str] = None, + mlflow_tracking_uri: Optional[str] = None, + exclude_artifacts: bool = False, + max_artifact_size: int = 50, +) -> None: + + verify_type("max_artifact_size", max_artifact_size, int) + + if max_artifact_size <= 0: + raise ValueError("Max artifact size must be a positive integer") + + with init_project(project=project_name, api_token=api_token) as project: + NeptuneExporter( + project=project, + project_name=project_name, + api_token=api_token, + mlflow_tracking_uri=mlflow_tracking_uri, + exclude_artifacts=exclude_artifacts, + max_artifact_size=max_artifact_size, + ).run() diff --git a/src/neptune_mlflow_exporter/impl/version.py b/src/neptune_mlflow_exporter/impl/version.py new file mode 100644 index 0000000..a3beacd --- /dev/null +++ b/src/neptune_mlflow_exporter/impl/version.py @@ -0,0 +1,47 @@ +# +# Copyright (c) 2023, Neptune Labs Sp. z o.o. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +__all__ = ["__version__"] + +import sys +from importlib.util import find_spec + +if sys.version_info >= (3, 8): + from importlib.metadata import ( + PackageNotFoundError, + version, + ) +else: + from importlib_metadata import ( + PackageNotFoundError, + version, + ) + +if not (find_spec("neptune") or find_spec("neptune-client")): + msg = """ + The Neptune client library was not found. + + Install the neptune package with + `pip install neptune` + + Need help? -> https://docs.neptune.ai/setup/installation/""" + raise PackageNotFoundError(msg) + +try: + __version__ = version("neptune-mlflow") +except PackageNotFoundError: + # package is not installed + pass diff --git a/src/neptune_mlflow_plugin/__init__.py b/src/neptune_mlflow_plugin/__init__.py deleted file mode 100644 index b412d28..0000000 --- a/src/neptune_mlflow_plugin/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import click - - -@click.command("mlflow") -@click.argument("path", required=False) -@click.option("--project", "-p", help="Project name") -def sync(path, project): - """Upload mlflow runs data to Neptune. 
- PATH is a directory where Neptune will look for `mlruns` directory with mlflow data. - - Examples: - - neptune mlflow . - - neptune mlflow /path - - neptune mlflow /path --project username/sandbox - - """ - - # We do not want to import anything if process was executed for autocompletion purposes. - from neptune_mlflow.sync import sync as run_sync - - return run_sync(path=path, project=project) diff --git a/test_requirements.txt b/test_requirements.txt deleted file mode 100644 index 33ff26e..0000000 --- a/test_requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -astroid==1.6.1 -bunch==1.0.1 -mock==2.0.0 -psutil==5.6.6 -pytest==4.1.0 -pytest-cov==2.6.1 -pytest-xdist==1.25.0 -tox==3.6.1 -attrs==17.4.0 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..78c6f47 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,60 @@ +try: + from neptune import ( + Project, + init_project, + ) +except ImportError: + from neptune.new import init_project + from neptune.new import Project + +import pytest +import tensorflow as tf + +from neptune_mlflow_exporter.impl import NeptuneExporter + + +@pytest.fixture(scope="session") +def dataset(): + x_train = tf.random.uniform(shape=[2, 28, 28]) + y_train = tf.constant([1, 1], shape=(2, 1), dtype=tf.int8) + x_test = tf.random.uniform(shape=[2, 28, 28]) + y_test = tf.constant([1, 1], shape=(2, 1), dtype=tf.int8) + + return (x_train, y_train), (x_test, y_test) + + +@pytest.fixture(scope="session") +def model(): + model = tf.keras.models.Sequential( + [ + # We are *not* providing input_size to the first layer, so the test will catch also the case where the + # model was not build yet: https://stackoverflow.com/q/55908188/3986320 + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(10), + ] + ) + lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( + initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.9 + ) + optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule) + + loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"]) + return model + + +@pytest.fixture(scope="session") +def neptune_project() -> Project: + with init_project(mode="debug", project="organization/project") as project: + yield project + + +@pytest.fixture(scope="session") +def neptune_exporter(neptune_project) -> NeptuneExporter: + yield NeptuneExporter(project=neptune_project, mlflow_tracking_uri="test_tracking_uri") + + +@pytest.fixture(scope="session") +def neptune_exporter_e2e() -> NeptuneExporter: + project = init_project() + yield NeptuneExporter(project, exclude_artifacts=False) diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/e2e/test_neptune_mlflow.py b/tests/e2e/test_neptune_mlflow.py new file mode 100644 index 0000000..48afc7e --- /dev/null +++ b/tests/e2e/test_neptune_mlflow.py @@ -0,0 +1,90 @@ +import mlflow +import mlflow.keras +import mlflow.tensorflow +import neptune + +EPOCHS = 1 +BATCH_SIZE = 1 +MODEL_NAME = "test_model" +MLFLOW_EXPERIMENT_NAME = "E2E neptune experiment" +MLFLOW_RUN_TAGS = { + "tag1": "value1", + "tag2": "value2", +} + + +def train(dataset, model): + mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME) + mlflow.keras.autolog() + + with mlflow.start_run() as run: + mlflow.set_tags(MLFLOW_RUN_TAGS) + run_id = run.info.run_id + + (x_train, y_train), (x_test, y_test) = dataset + + model.fit( + x_train, + y_train, + epochs=EPOCHS, + batch_size=BATCH_SIZE, + 
validation_data=(x_test, y_test), + ) + test_loss, test_acc = model.evaluate(x_test, y_test) + + mlflow.log_param("epochs", EPOCHS) + mlflow.log_param("batch_size", BATCH_SIZE) + + mlflow.log_metric("test_acc", test_acc) + mlflow.log_metric("test_loss", test_loss) + + # Save as TensorFlow SavedModel format (MLflow Keras default) + mlflow.keras.log_model(model, "keras-model", registered_model_name=MODEL_NAME) + + # write model summary + summary = [] + model.summary(print_fn=summary.append) + summary = "\n".join(summary) + with open("model_summary.txt", "w") as f: + f.write(summary) + mlflow.log_artifact("model_summary.txt") + return run_id + + +def test_e2e(dataset, model, neptune_exporter_e2e): + + run_id = train(dataset, model) + + neptune_exporter_e2e.run() + + # check logged project metadata + experiment = mlflow.get_experiment_by_name("E2E neptune experiment") + + # check logged run metadata + neptune_run = neptune.init_run(custom_run_id=run_id) + + # experiment + assert neptune_run["experiment/experiment_id"].fetch() == experiment.experiment_id + assert neptune_run["experiment/name"].fetch() == MLFLOW_EXPERIMENT_NAME + assert neptune_run.exists("experiment/creation_time") + assert neptune_run.exists("experiment/last_update_time") + + # run info + assert neptune_run["run_info/lifecycle_stage"].fetch() == "active" + assert neptune_run.exists("run_info/status") + + # run data + assert set(MLFLOW_RUN_TAGS.items()).issubset(set(neptune_run["run_data/tags"].fetch().items())) + + assert neptune_run.exists("run_data/metrics/accuracy") + assert neptune_run.exists("run_data/metrics/test_loss") + assert neptune_run.exists("run_data/metrics/val_accuracy") + assert neptune_run.exists("run_data/metrics/loss") + + assert int(neptune_run["run_data/params/epochs"].fetch()) == EPOCHS + assert int(neptune_run["run_data/params/batch_size"].fetch()) == BATCH_SIZE + + # logged artifacts + assert neptune_run.exists("artifacts/model_summary.txt") + assert neptune_run.exists("artifacts/keras-model") + assert neptune_run.exists("artifacts/tensorboard_logs") diff --git a/tests/neptune_mlflow/test_data_loader.py b/tests/neptune_mlflow/test_data_loader.py deleted file mode 100644 index cd94147..0000000 --- a/tests/neptune_mlflow/test_data_loader.py +++ /dev/null @@ -1,105 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import unittest -import uuid - -from mock import MagicMock - -from neptune_mlflow.data_loader import DataLoader - - -class TestDataLodaer(unittest.TestCase): - def test_get_run_qualified_name(self): - # given - exp = MagicMock() - exp.name = "exp_name" - - # and - run_info = MagicMock() - run_info.name = None - run_info.run_id = "run_uuid" - - # expect - self.assertEqual(DataLoader._get_run_qualified_name(exp, run_info), "exp_name/run_uuid") - - def test_get_metric_file(self): - # given - exp = MagicMock() - exp.experiment_id = "exp_id" - - # and - run_info = MagicMock() - run_info.run_uuid = "run_uuid" - - # and - metric_key = "metric_key" - - # expect - self.assertEqual( - DataLoader._get_metric_file(exp, run_info, metric_key), "mlruns/exp_id/run_uuid/metrics/metric_key" - ) - - def test_get_tags(self): - # given - exp = MagicMock() - exp.name = "EXPeriMENT-NaMe" - - # and - run = MagicMock() - run.info = MagicMock() - run.data.tags = {} - - # expect - self.assertEqual(set(DataLoader._get_tags(exp, run)), {"mlflow", "experiment-name"}) - - def test_get_properties(self): - # given - exp = MagicMock() - exp.experiment_id = 123 - exp.name = "EXPeriMENT-NaMe" - - # and - run = MagicMock() - - # and - run.info = MagicMock() - run.info.run_uuid = str(uuid.uuid4()) - - # and - run.data = MagicMock() - run.data.tags = {"key1": "value1", "key2": "value2"} - - # expect - self.assertEqual( - DataLoader._get_properties(exp, run), - { - "mlflow/experiment/id": str(exp.experiment_id), - "mlflow/experiment/name": exp.name, - "mlflow/run/uuid": run.info.run_uuid, - "mlflow/run/name": "", - "key1": "value1", - "key2": "value2", - }, - ) - - def test_get_params(self): - # given - run = MagicMock() - run.data = MagicMock() - run.data.params = {"key1": "value1", "key2": "value2"} - - # expect - self.assertEqual(DataLoader._get_params(run), {"key1": "value1", "key2": "value2"}) diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/neptune_mlflow/__init__.py b/tests/unit/neptune_mlflow_exporter/__init__.py similarity index 100% rename from src/neptune_mlflow/__init__.py rename to tests/unit/neptune_mlflow_exporter/__init__.py diff --git a/tests/unit/neptune_mlflow_exporter/test_artifact_strategy.py b/tests/unit/neptune_mlflow_exporter/test_artifact_strategy.py new file mode 100644 index 0000000..4481f90 --- /dev/null +++ b/tests/unit/neptune_mlflow_exporter/test_artifact_strategy.py @@ -0,0 +1,39 @@ +from unittest.mock import ( + MagicMock, + patch, +) + +from mlflow.entities import FileInfo + +from neptune_mlflow_exporter.impl.artifact_strategy import ( + DirectoryUploadStrategy, + FileUploadStrategy, +) + + +@patch("neptune_mlflow_exporter.impl.artifact_strategy.download_artifacts") +def test_file_upload_strategy_does_not_upload_file_above_limit_size(mock_download_artifacts): + strategy = FileUploadStrategy(tracking_uri="", max_file_size=500) + file_info = FileInfo("some_path", False, 1000) + + strategy.upload_artifact(MagicMock(), file_info, MagicMock()) + + mock_download_artifacts.assert_not_called() + + +@patch("neptune.handler.Handler.upload_files") +@patch("neptune_mlflow_exporter.impl.artifact_strategy.download_artifacts") +@patch("neptune_mlflow_exporter.impl.artifact_strategy.get_dir_size", return_value=1000) +def test_directory_upload_strategy_does_not_upload_dir_above_limit_size( + mock_get_dir_size, mock_download_artifacts, mock_upload +): + strategy = DirectoryUploadStrategy(tracking_uri="", max_file_size=500) + file_info = 
FileInfo("some_path", True, None) + + strategy.upload_artifact(MagicMock(), file_info, MagicMock()) + + # size estimated after dir is downloaded + mock_download_artifacts.assert_called_once() + mock_get_dir_size.assert_called_once() + + mock_upload.assert_not_called() diff --git a/tests/unit/neptune_mlflow_exporter/test_neptune_exporter.py b/tests/unit/neptune_mlflow_exporter/test_neptune_exporter.py new file mode 100644 index 0000000..8de2fae --- /dev/null +++ b/tests/unit/neptune_mlflow_exporter/test_neptune_exporter.py @@ -0,0 +1,3 @@ +class TestNeptuneExporter: + def test_client_received_correct_tracking_uri(self, neptune_exporter): + assert neptune_exporter.mlflow_client.tracking_uri == "test_tracking_uri" diff --git a/tests/neptune_mlflow_plugin/test_plugin.py b/tests/unit/neptune_mlflow_exporter/test_plugin.py similarity index 53% rename from tests/neptune_mlflow_plugin/test_plugin.py rename to tests/unit/neptune_mlflow_exporter/test_plugin.py index 8743395..86b2efd 100644 --- a/tests/neptune_mlflow_plugin/test_plugin.py +++ b/tests/unit/neptune_mlflow_exporter/test_plugin.py @@ -13,27 +13,32 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import random import unittest +from unittest.mock import patch from click.testing import CliRunner -from neptune_mlflow_plugin import sync +from neptune_mlflow_exporter import sync class TestPlugin(unittest.TestCase): runner = CliRunner() - def test_path_not_exist(self): - path = "/tmp/{}".format(random.randint(10000, 1000000)) - result = self.runner.invoke(sync, [path]) + @patch("neptune_mlflow_exporter.impl.sync.sync") + def test_sync_called_once(self, mock_sync): + result = self.runner.invoke(sync) + + self.assertEqual(result.exit_code, 0) + + mock_sync.assert_called_once_with( + project_name=None, api_token=None, mlflow_tracking_uri=None, exclude_artifacts=False, max_artifact_size=50 + ) + + def test_invalid_max_artifact_size(self): + result = self.runner.invoke(sync, ["-m", -100]) self.assertEqual(result.exit_code, 1) - self.assertEqual(result.output.strip(), "ERROR: Directory `{}` doesn't exist".format(path)) + self.assertIsInstance(result.exception, ValueError) - def test_path_is_not_dir(self): - path = "/tmp/{}".format(random.randint(10000, 1000000)) - with open(path, "a") as f: - f.write("text") - result = self.runner.invoke(sync, [path]) + result = self.runner.invoke(sync, ["-m", 0]) self.assertEqual(result.exit_code, 1) - self.assertEqual(result.output.strip(), "ERROR: `{}` is not a directory".format(path)) + self.assertIsInstance(result.exception, ValueError) diff --git a/tests/unit/neptune_mlflow_exporter/test_sync.py b/tests/unit/neptune_mlflow_exporter/test_sync.py new file mode 100644 index 0000000..4897670 --- /dev/null +++ b/tests/unit/neptune_mlflow_exporter/test_sync.py @@ -0,0 +1,14 @@ +import pytest + +from neptune_mlflow_exporter.impl.sync import sync + + +def test_invalid_max_artifact_size() -> None: + with pytest.raises(ValueError): + sync(max_artifact_size=0) + + with pytest.raises(ValueError): + sync(max_artifact_size=-100) + + with pytest.raises(TypeError): + sync(max_artifact_size=50.5) diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 37345ea..0000000 --- a/tox.ini +++ /dev/null @@ -1,29 +0,0 @@ -[tox] -# platform specification support is available since version 2.0 -minversion = 2.0 -envlist = py{27,35,36}-{linux,win32} - -[testenv] -# See https://docs.python.org/2/library/sys.html#sys.platform for platform codes -platform = linux: linux - 
win32: win32 - -deps = -rtest_requirements.txt - -rrequirements.txt - -commands = pytest \ - --cov-config tox.ini \ - --cov-report xml:tests/coverage-{envname}.xml \ - --cov . \ - -n 4 \ - --junitxml tests/results-{envname}.xml \ - {posargs} \ - tests/neptune_mlflow - -setenv = PYTHONIOENCODING=UTF-8 - -[coverage:run] -omit = .tox/* - -[coverage:report] -exclude_lines = unittest.main
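
Usage sketch: the rewritten exporter defined in this patch can be driven either through the `neptune mlflow` CLI plugin (registered under the `neptune.plugins` entry point as `neptune_mlflow_exporter:sync`) or directly from Python via `neptune_mlflow_exporter.impl.sync.sync`. The snippet below is a minimal illustration based only on the signatures introduced above; the project name and tracking URI are placeholders, and it assumes the package is installed and `NEPTUNE_API_TOKEN` is available in the environment.

    # Minimal sketch of calling the exporter from Python (placeholder values).
    from neptune_mlflow_exporter.impl.sync import sync

    # Export all MLflow runs from a local tracking store into a Neptune project,
    # skipping any artifact (or artifact directory) larger than 100 MB.
    sync(
        project_name="my-workspace/my-project",  # placeholder Neptune project
        mlflow_tracking_uri="file:./mlruns",     # placeholder MLflow tracking URI
        exclude_artifacts=False,
        max_artifact_size=100,                   # MB; must be a positive integer
    )

The roughly equivalent CLI invocation, assuming the plugin is picked up by the Neptune CLI, would be `neptune mlflow -p my-workspace/my-project -u file:./mlruns -m 100`.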