diff --git a/cli/README.md b/cli/README.md new file mode 100644 index 000000000..4a3cd3bfc --- /dev/null +++ b/cli/README.md @@ -0,0 +1,66 @@ +## Available Commands +(Explore more advanced options in the code, this is basic usage demonstration) + +### generate-item-yaml +Generate an `item.yaml` file (basic draft) in the appropriate directory from a Jinja2 template + +Usage: + `python -m cli.cli generate-item-yaml TYPE NAME` + +Example: + `python -m cli.cli generate-item-yaml function aggregate` + +--- + +### item-to-function +Creates a `function.yaml` file based on a provided `item.yaml` file. + +Usage: + `python -m cli.cli item-to-function --item-path PATH` + +Example: + `python -m cli.cli item-to-function --item-path functions/src/aggregate` + +--- + +### function-to-item +Creates a `item.yaml` file based on a provided `function.yaml` file. + +Usage: + `python -m cli.cli function-to-item PATH` + +Example: + `python -m cli.cli function-to-item --path functions/src/aggregate` + +--- + +### run-tests +Run assets test suite. + +Usage: + `python -m cli.cli run-tests -r PATH -s TYPE -fn NAME` + +Example: + `python -m cli.cli run-tests -r functions/src/aggregate -s py -fn aggregate` + +--- + +### build-marketplace +Build and push (create a PR) the updated marketplace/ directory (e.g: marketplace/functions) + +Usage: + `python -m cli.cli build-marketplace -s SOURCE-DIR -sn TYPE -m MARKETPLACE-DIR -c CHANNEL -v -f` + +Example: + `python -m cli.cli build-marketplace -s ./functions/src -sn functions -m marketplace -c master -v -f` + +--- + +### update-readme +Regenerate the `README.md` files in each of the asset directories (functions/modules). + +Usage: + `python -m cli.cli update-readme --asset TYPE` + +Example: + `python -m cli.cli update-readme --asset functions --asset modules` \ No newline at end of file diff --git a/cli/cli.py b/cli/cli.py index 8fee9891a..e8e6922fe 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -17,22 +17,19 @@ from cli.functions.function_to_item import function_to_item_cli from cli.functions.item_to_function import item_to_function_cli from cli.marketplace.build import build_marketplace_cli -from cli.functions.new_function_item import new_item as new_function_item from cli.common.test_suite import test_suite -from cli.common.item_yaml import update_functions_yaml from cli.common.update_readme import update_readme +from cli.common.generate_item_yaml import generate_item_yaml @click.group() def cli(): pass - -cli.add_command(new_function_item) +cli.add_command(generate_item_yaml, name="generate-item-yaml") cli.add_command(item_to_function_cli, name="item-to-function") cli.add_command(function_to_item_cli, name="function-to-item") cli.add_command(test_suite, name="run-tests") cli.add_command(build_marketplace_cli, name="build-marketplace") -cli.add_command(update_functions_yaml, name="update-functions-yaml") cli.add_command(update_readme, name="update-readme") if __name__ == "__main__": diff --git a/cli/common/generate_item_yaml.py b/cli/common/generate_item_yaml.py new file mode 100644 index 000000000..9ce362c37 --- /dev/null +++ b/cli/common/generate_item_yaml.py @@ -0,0 +1,55 @@ +import sys +from pathlib import Path +from datetime import datetime +import click +from jinja2 import Environment, FileSystemLoader + +TEMPLATES = { + "function": "cli/utils/function_item_template.yaml.j2", + "module": "cli/utils/module_item_template.yaml.j2", +} + + +@click.command() +@click.argument("type", type=click.Choice(list(TEMPLATES.keys()))) +@click.argument("name") +@click.option("--overwrite", is_flag=True, help="Replace existing file instead of raising an error.") +def generate_item_yaml(type: str, name: str, overwrite: bool = False): + """ + Generate an item.yaml file from a template. + +type: one of the supported types (currently only `function` or `module`) +name: the function/module name (also used as the directory name) +overwrite: whether to overwrite existing item.yaml file + """ + # Construct the target path + path = Path(f"{type}s/src/{name}").resolve() + output_file = path / "item.yaml" + + if not overwrite and output_file.exists(): + click.echo(f"Error: {output_file} already exists.", err=True) + sys.exit(1) + + if not path.exists(): + click.echo(f"Error: {path} does not exist.", err=True) + sys.exit(1) + + # Render parameters + params = { + "example": f"{name}.ipynb", + "generationDate": datetime.utcnow().strftime("%Y-%m-%d"), + "name": name, + "filename": f"{name}.py", + } + + # Load and render template + env = Environment(loader=FileSystemLoader(".")) + template = env.get_template(TEMPLATES[type]) + rendered = template.render(params) + + output_file.write_text(rendered) + click.echo(f"Created {output_file}") + + +if __name__ == "__main__": + generate_item_yaml() \ No newline at end of file diff --git a/cli/common/item_yaml.py b/cli/common/item_yaml.py deleted file mode 100644 index a14ea48c2..000000000 --- a/cli/common/item_yaml.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import click -from cli.utils.path_iterator import PathIterator -from cli.utils.helpers import is_item_dir -import yaml -import datetime - - -@click.command() -@click.option("-r", "--root-directory", default=".", help="Path to root directory") -@click.option("-v", "--version", help="update version number in function item yaml") -@click.option("-mv", "--mlrun-version", help="update mlrun version in function item.yaml") -@click.option("-p", "--platform-version", help="update platform version in function item.yaml") -@click.option("-d", "--date-time", help="update date-time in function item.yaml") -def update_functions_yaml(root_directory: str, - version: str, - mlrun_version: str, - platform_version: str, - date_time: str): - if not root_directory: - click.echo("-r/--root-directory is required") - exit(1) - - item_iterator = PathIterator(root=root_directory, rule=is_item_dir, as_path=True) - for inner_dir in item_iterator: - item_yaml = "item.yaml" - if (inner_dir / item_yaml).exists(): - path = str(inner_dir)+"/"+item_yaml - stream = open(path, 'r') - data = yaml.load(stream=stream, Loader=yaml.FullLoader) - if version: - data['version'] = version - if mlrun_version: - data['mlrunVersion'] = mlrun_version - if platform_version: - data['platformVersion'] = platform_version - if date_time: - data['generationDate'] = datetime.datetime.now().strftime('%Y-%m-%d:%H-%M') - print(data) - with open(path, 'w') as yaml_file: - yaml_file.write(yaml.dump(data, default_flow_style=False)) diff --git a/cli/functions/new_function_item.py b/cli/functions/new_function_item.py deleted file mode 100644 index 70eb30d55..000000000 --- a/cli/functions/new_function_item.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from datetime import datetime -from pathlib import Path - -import click - - -@click.command() -@click.option( - "-p", "--path", help="Path to directory in which a new item.yaml will be created" -) -@click.option("-o", "--override", is_flag=True, help="Override if already exists") -def new_item(path: str, override: bool): - path = Path(path) / "item.yaml" - - if not path.parent.exists(): - path.parent.mkdir(parents=True) - elif path.exists() and not override: - click.echo( - f"{path / 'item.yaml'} already exists, set [-o, --override] to override" - ) - exit(1) - - with open(path, "w") as f: - f.write( - f""" -apiVersion: v1 -categories: [] # List of category names -description: '' # Short description -doc: '' # Path to README.md if exists -example: '' # Path to examole notebook -generationDate: {str(datetime.utcnow())} -icon: '' # Path to icon file -labels: {{}} # Key values label pairs -maintainers: [] # List of maintainers -mlrunVersion: '' # Function’s MLRun version requirement, should follow python’s versioning schema -name: '' # Function name -platformVersion: '' # Function’s Iguazio version requirement, should follow python’s versioning schema -spec: - filename: '' # Implementation file - handler: '' # Handler function name - image: '' # Base image name - kind: '' # Function kind - requirements: [] # List of Pythonic library requirements - customFields: {{}} # Custom spec fields - env: [] # Spec environment params -url: '' -version: 0.0.1 # Function version, should follow standard semantic versioning schema -""" - ) - - -if __name__ == "__main__": - new_item() diff --git a/cli/utils/function_item_template.yaml.j2 b/cli/utils/function_item_template.yaml.j2 new file mode 100644 index 000000000..da35ef819 --- /dev/null +++ b/cli/utils/function_item_template.yaml.j2 @@ -0,0 +1,22 @@ +apiVersion: v1 +categories: [] {# List of category names #} +description: '' {# Short description #} +doc: '' {# Path to README.md if exists #} +example: {{ example|default('') }} {# Path to example notebook #} +generationDate: {{ generationDate|default('') }} {# Automatically generated ISO8086 datetime #} +hidden: false {# Hide function from the UI #} +icon: '' {# Path to icon file #} +labels: {# Key values label pairs #} + author: Iguazio +maintainers: [] {# List of maintainers #} +mlrunVersion: '' {# Function’s MLRun version requirement, should follow python’s versioning schema #} +name: {{ name|default('') }} {# Function name #} +platformVersion: '' {# Function’s Iguazio version requirement, should follow python’s versioning schema #} +spec: + filename: {{ filename|default('') }} {# Implementation file #} + handler: '' {# Handler function name #} + image: mlrun/mlrun {# Base image name #} + kind: '' {# Function kind #} + requirements: [] {# List of Pythonic library requirements #} +url: '' +version: 1.0.0 {# Function version, should follow standard semantic versioning schema #} \ No newline at end of file diff --git a/cli/utils/item_template.yaml b/cli/utils/item_template.yaml deleted file mode 100644 index b1d38d334..000000000 --- a/cli/utils/item_template.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: v1 -categories: [] # List of category names -description: '' # Short description -doc: '' # Path to README.md if exists -example: '' # Path to examole notebook -generationDate: '' # Automatically generated ISO8086 datetime -hidden: false # Hide function from the UI -icon: '' # Path to icon file -labels: {} # Key values label pairs -maintainers: [] # List of maintainers -mlrunVersion: '' # Function’s MLRun version requirement, should follow python’s versioning schema -name: '' # Function name -platformVersion: '' # Function’s Iguazio version requirement, should follow python’s versioning schema -spec: - filename: '' # Implementation file - handler: '' # Handler function name - image: '' # Base image name - kind: '' # Function kind - requirements: [] # List of Pythonic library requirements -url: '' # ??? -version: '' # Function version, should follow standard semantic versioning schema \ No newline at end of file diff --git a/cli/utils/module_item_template.yaml.j2 b/cli/utils/module_item_template.yaml.j2 new file mode 100644 index 000000000..539cd6f0a --- /dev/null +++ b/cli/utils/module_item_template.yaml.j2 @@ -0,0 +1,16 @@ +apiVersion: v1 +categories: [] {# List of category names #} +description: '' {# Short description #} +example: {{ example|default('') }} {# Path to example notebook #} +generationDate: {{ generationDate|default('') }} {# Automatically generated ISO8086 datetime #} +hidden: false {# Hide Module from the UI #} +labels: + author: Iguazio +mlrunVersion: '' {# Module’s MLRun version requirement, should follow python’s versioning schema #} +name: {{ name|default('') }} {# Module name #} +spec: + filename: {{ filename|default('') }} {# Implementation file #} + image: mlrun/mlrun {# Base image name #} + kind: '' {# Module kind #} + requirements: [] {# List of Pythonic library requirements #} +version: 1.0.0 {# Module version, should follow standard semantic versioning schema #} \ No newline at end of file diff --git a/functions/src/aggregate/item.yaml b/functions/src/aggregate/item.yaml index 75f7e74c5..43e87a4a2 100644 --- a/functions/src/aggregate/item.yaml +++ b/functions/src/aggregate/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: avia + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/arc_to_parquet/item.yaml b/functions/src/arc_to_parquet/item.yaml index 4bc2634ce..fe2925aef 100644 --- a/functions/src/arc_to_parquet/item.yaml +++ b/functions/src/arc_to_parquet/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: avi + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/auto_trainer/item.yaml b/functions/src/auto_trainer/item.yaml index 7e622db29..ba33f6a08 100755 --- a/functions/src/auto_trainer/item.yaml +++ b/functions/src/auto_trainer/item.yaml @@ -10,7 +10,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yonish + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/azureml_serving/function.yaml b/functions/src/azureml_serving/function.yaml index b2242da1d..978806878 100644 --- a/functions/src/azureml_serving/function.yaml +++ b/functions/src/azureml_serving/function.yaml @@ -5,7 +5,7 @@ metadata: hash: c0f404820b8f0fe92d2d1cfe9dbcc068be1a13bf project: '' labels: - author: yonish + author: Iguazio categories: - machine-learning - model-serving diff --git a/functions/src/azureml_serving/item.yaml b/functions/src/azureml_serving/item.yaml index d20e636b0..93fb046b2 100644 --- a/functions/src/azureml_serving/item.yaml +++ b/functions/src/azureml_serving/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yonish + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.1.0 diff --git a/functions/src/azureml_utils/item.yaml b/functions/src/azureml_utils/item.yaml index 342307643..ae33ad5b1 100644 --- a/functions/src/azureml_utils/item.yaml +++ b/functions/src/azureml_utils/item.yaml @@ -10,7 +10,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yonish + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/batch_inference/item.yaml b/functions/src/batch_inference/item.yaml index 16a56cfe7..65b61431e 100644 --- a/functions/src/batch_inference/item.yaml +++ b/functions/src/batch_inference/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: guyl + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/batch_inference_v2/item.yaml b/functions/src/batch_inference_v2/item.yaml index 775579b9e..8b8f01df0 100644 --- a/functions/src/batch_inference_v2/item.yaml +++ b/functions/src/batch_inference_v2/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-08-07:12-25 hidden: false icon: '' labels: - author: eyald + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0-rc51 diff --git a/functions/src/describe/item.yaml b/functions/src/describe/item.yaml index 2c41a025f..da26f1501 100644 --- a/functions/src/describe/item.yaml +++ b/functions/src/describe/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: Davids + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/describe_spark/item.yaml b/functions/src/describe_spark/item.yaml index 6c4ad32d9..58e267d4a 100644 --- a/functions/src/describe_spark/item.yaml +++ b/functions/src/describe_spark/item.yaml @@ -7,7 +7,8 @@ example: describe_spark.ipynb generationDate: 2022-08-28:17-25 hidden: false icon: '' -labels: {} +labels: + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.1.0 diff --git a/functions/src/feature_selection/item.yaml b/functions/src/feature_selection/item.yaml index 5356024df..4f9a3a5dd 100644 --- a/functions/src/feature_selection/item.yaml +++ b/functions/src/feature_selection/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: orz + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.8.0-rc40 diff --git a/functions/src/gen_class_data/item.yaml b/functions/src/gen_class_data/item.yaml index a6dd94b61..30f5cd21c 100644 --- a/functions/src/gen_class_data/item.yaml +++ b/functions/src/gen_class_data/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: Daniel + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/github_utils/function.yaml b/functions/src/github_utils/function.yaml index fe60cff7a..2d5d93aab 100644 --- a/functions/src/github_utils/function.yaml +++ b/functions/src/github_utils/function.yaml @@ -5,7 +5,7 @@ metadata: hash: d8e639af306794ce6f59eb246f0b845c016c9da4 project: '' labels: - author: yaronh + author: Iguazio categories: - utils spec: diff --git a/functions/src/github_utils/item.yaml b/functions/src/github_utils/item.yaml index c00bf86b2..9c06d84a7 100644 --- a/functions/src/github_utils/item.yaml +++ b/functions/src/github_utils/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yaronh + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.1.0 diff --git a/functions/src/hugging_face_serving/item.yaml b/functions/src/hugging_face_serving/item.yaml index 48b063e49..edad986be 100644 --- a/functions/src/hugging_face_serving/item.yaml +++ b/functions/src/hugging_face_serving/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-09-05:17-00 hidden: false icon: '' labels: - author: yonish + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.1.0 diff --git a/functions/src/load_dataset/function.yaml b/functions/src/load_dataset/function.yaml index 046bb5cc4..91775a802 100644 --- a/functions/src/load_dataset/function.yaml +++ b/functions/src/load_dataset/function.yaml @@ -5,7 +5,7 @@ metadata: hash: d05aa41d618533335eeaeab38aa434a14e3e3980 project: '' labels: - author: yjb + author: Iguazio framework: sklearn categories: - data-preparation diff --git a/functions/src/load_dataset/item.yaml b/functions/src/load_dataset/item.yaml index d9fcf8d61..fb6f69c40 100644 --- a/functions/src/load_dataset/item.yaml +++ b/functions/src/load_dataset/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yjb + author: Iguazio framework: sklearn maintainers: [] marketplaceType: '' diff --git a/functions/src/mlflow_utils/item.yaml b/functions/src/mlflow_utils/item.yaml index 79304eb38..176a9dd95 100644 --- a/functions/src/mlflow_utils/item.yaml +++ b/functions/src/mlflow_utils/item.yaml @@ -9,7 +9,7 @@ generationDate: 2024-05-23:12-00 hidden: false icon: '' labels: - author: zeevr + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.8.0 diff --git a/functions/src/model_server/item.yaml b/functions/src/model_server/item.yaml index c85cf163d..65c6f09e7 100644 --- a/functions/src/model_server/item.yaml +++ b/functions/src/model_server/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yaronh + author: Iguazio framework: sklearn maintainers: [] marketplaceType: '' diff --git a/functions/src/model_server_tester/function.yaml b/functions/src/model_server_tester/function.yaml index eda10459e..45934c444 100644 --- a/functions/src/model_server_tester/function.yaml +++ b/functions/src/model_server_tester/function.yaml @@ -5,7 +5,7 @@ metadata: hash: 3b203a2799e44992539eafd32a4b8979bbcc8001 project: '' labels: - author: yaronh + author: Iguazio categories: - monitoring - model-serving diff --git a/functions/src/model_server_tester/item.yaml b/functions/src/model_server_tester/item.yaml index 3e43a9297..b18e0082c 100644 --- a/functions/src/model_server_tester/item.yaml +++ b/functions/src/model_server_tester/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yaronh + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.1.0 diff --git a/functions/src/noise_reduction/item.yaml b/functions/src/noise_reduction/item.yaml index f748d5587..c37b4ab39 100644 --- a/functions/src/noise_reduction/item.yaml +++ b/functions/src/noise_reduction/item.yaml @@ -9,7 +9,7 @@ generationDate: 2024-03-04:17-30 hidden: false icon: '' labels: - author: yonatans + author: Iguazio maintainers: [] mlrunVersion: 1.7.0 name: noise-reduction @@ -26,4 +26,5 @@ spec: torchaudio>=2.1.2, ] url: '' -version: 1.1.0 \ No newline at end of file +version: 1.1.0 +test_valid: False \ No newline at end of file diff --git a/functions/src/onnx_utils/item.yaml b/functions/src/onnx_utils/item.yaml index 02134f32d..81ad593d5 100644 --- a/functions/src/onnx_utils/item.yaml +++ b/functions/src/onnx_utils/item.yaml @@ -10,7 +10,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: guyl + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.2 diff --git a/functions/src/open_archive/item.yaml b/functions/src/open_archive/item.yaml index 0a2f4516c..c40a62e4a 100644 --- a/functions/src/open_archive/item.yaml +++ b/functions/src/open_archive/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yaronh + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.8.0-rc50 diff --git a/functions/src/pii_recognizer/item.yaml b/functions/src/pii_recognizer/item.yaml index 8f3185b4c..dcd71c85c 100644 --- a/functions/src/pii_recognizer/item.yaml +++ b/functions/src/pii_recognizer/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-08-15:10-24 hidden: false icon: '' labels: - author: pgw + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/pyannote_audio/item.yaml b/functions/src/pyannote_audio/item.yaml index b6dbccddb..79a5a0f1b 100644 --- a/functions/src/pyannote_audio/item.yaml +++ b/functions/src/pyannote_audio/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-12-03:14-30 hidden: false icon: '' labels: - author: guyl + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/question_answering/item.yaml b/functions/src/question_answering/item.yaml index 741bab80c..b307a9877 100755 --- a/functions/src/question_answering/item.yaml +++ b/functions/src/question_answering/item.yaml @@ -8,7 +8,7 @@ generationDate: 2023-08-07:11-30 hidden: false icon: '' labels: - author: yonish + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/send_email/function.yaml b/functions/src/send_email/function.yaml index e895cddc9..1722fb586 100644 --- a/functions/src/send_email/function.yaml +++ b/functions/src/send_email/function.yaml @@ -5,7 +5,7 @@ metadata: hash: 5c4528084ea98992b77f65e29359bbcb4a0df8ab project: '' labels: - author: saarc + author: Iguazio categories: - utils spec: diff --git a/functions/src/send_email/item.yaml b/functions/src/send_email/item.yaml index 4c42cb73b..6caf1ab50 100644 --- a/functions/src/send_email/item.yaml +++ b/functions/src/send_email/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: saarc + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.4.1 diff --git a/functions/src/silero_vad/item.yaml b/functions/src/silero_vad/item.yaml index 49adfcd9f..7a1aeaee2 100644 --- a/functions/src/silero_vad/item.yaml +++ b/functions/src/silero_vad/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-12-03:14-30 hidden: false icon: '' labels: - author: guyl + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/sklearn_classifier/item.yaml b/functions/src/sklearn_classifier/item.yaml index 1b41e630a..b9726fb79 100644 --- a/functions/src/sklearn_classifier/item.yaml +++ b/functions/src/sklearn_classifier/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: true icon: '' labels: - author: yjb + author: Iguazio framework: sklearn maintainers: [] marketplaceType: '' diff --git a/functions/src/sklearn_classifier_dask/function.yaml b/functions/src/sklearn_classifier_dask/function.yaml index 98be06b8c..46f733886 100644 --- a/functions/src/sklearn_classifier_dask/function.yaml +++ b/functions/src/sklearn_classifier_dask/function.yaml @@ -5,7 +5,7 @@ metadata: hash: e542038fbb84f790b7144b529665f36d70d80906 project: '' labels: - author: yjb + author: Iguazio framework: sklearn categories: - machine-learning diff --git a/functions/src/sklearn_classifier_dask/item.yaml b/functions/src/sklearn_classifier_dask/item.yaml index 35e89b2dd..3264ec681 100644 --- a/functions/src/sklearn_classifier_dask/item.yaml +++ b/functions/src/sklearn_classifier_dask/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: true icon: '' labels: - author: yjb + author: Iguazio framework: sklearn maintainers: [] marketplaceType: '' diff --git a/functions/src/structured_data_generator/item.yaml b/functions/src/structured_data_generator/item.yaml index 6e01aefb9..f268f05e6 100755 --- a/functions/src/structured_data_generator/item.yaml +++ b/functions/src/structured_data_generator/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-12-14:10-50 hidden: false icon: '' labels: - author: zeevr + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.8.0 diff --git a/functions/src/test_classifier/function.yaml b/functions/src/test_classifier/function.yaml index d0e1b6067..f35446b51 100644 --- a/functions/src/test_classifier/function.yaml +++ b/functions/src/test_classifier/function.yaml @@ -5,7 +5,7 @@ metadata: hash: b4d447a2328975e90a0dbc7a28f82009924cc157 project: '' labels: - author: yjb + author: Iguazio framework: sklearn categories: - machine-learning diff --git a/functions/src/test_classifier/item.yaml b/functions/src/test_classifier/item.yaml index e9f4982a9..a38497a73 100644 --- a/functions/src/test_classifier/item.yaml +++ b/functions/src/test_classifier/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: true icon: '' labels: - author: yjb + author: Iguazio framework: sklearn maintainers: [] marketplaceType: '' diff --git a/functions/src/text_to_audio_generator/item.yaml b/functions/src/text_to_audio_generator/item.yaml index ff9ec379f..13beef4b9 100644 --- a/functions/src/text_to_audio_generator/item.yaml +++ b/functions/src/text_to_audio_generator/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-12-03:15-30 hidden: false icon: '' labels: - author: yonatans + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.1 diff --git a/functions/src/tf2_serving/function.yaml b/functions/src/tf2_serving/function.yaml index c755263ae..17cf2fbb9 100644 --- a/functions/src/tf2_serving/function.yaml +++ b/functions/src/tf2_serving/function.yaml @@ -4,7 +4,7 @@ metadata: hash: 134293b94996e74275d90546f8d4ef96198af679 project: '' labels: - author: yaronh + author: Iguazio categories: - model-serving - machine-learning diff --git a/functions/src/tf2_serving/item.yaml b/functions/src/tf2_serving/item.yaml index 88dac8478..d7c793364 100644 --- a/functions/src/tf2_serving/item.yaml +++ b/functions/src/tf2_serving/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yaronh + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.1.0 diff --git a/functions/src/transcribe/item.yaml b/functions/src/transcribe/item.yaml index 6deaf710a..0bc9e5d0f 100644 --- a/functions/src/transcribe/item.yaml +++ b/functions/src/transcribe/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-07-13:11-20 hidden: false icon: '' labels: - author: yonatans + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/translate/item.yaml b/functions/src/translate/item.yaml index 839d1efaa..eb0e821e4 100644 --- a/functions/src/translate/item.yaml +++ b/functions/src/translate/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-12-05:17-20 hidden: false icon: '' labels: - author: guyl + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/v2_model_server/function.yaml b/functions/src/v2_model_server/function.yaml index 45d261b6a..5ecfec9ba 100644 --- a/functions/src/v2_model_server/function.yaml +++ b/functions/src/v2_model_server/function.yaml @@ -5,7 +5,7 @@ metadata: hash: ad85919d3b9cf2acae43a3434ba56e01b005755e project: '' labels: - author: yaronh + author: Iguazio framework: sklearn categories: - model-serving diff --git a/functions/src/v2_model_server/item.yaml b/functions/src/v2_model_server/item.yaml index 7bde91a64..4beda6243 100644 --- a/functions/src/v2_model_server/item.yaml +++ b/functions/src/v2_model_server/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yaronh + author: Iguazio framework: sklearn maintainers: [] marketplaceType: '' diff --git a/functions/src/v2_model_tester/function.yaml b/functions/src/v2_model_tester/function.yaml index 518bd1492..c9562b097 100644 --- a/functions/src/v2_model_tester/function.yaml +++ b/functions/src/v2_model_tester/function.yaml @@ -5,7 +5,7 @@ metadata: hash: 72d3f664ff2aa870109e44f52f975bda2ac13682 project: '' labels: - author: yaronh + author: Iguazio categories: - model-testing - machine-learning diff --git a/functions/src/v2_model_tester/item.yaml b/functions/src/v2_model_tester/item.yaml index ce1ecef5f..c3412fc5c 100644 --- a/functions/src/v2_model_tester/item.yaml +++ b/functions/src/v2_model_tester/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yaronh + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.1.0 diff --git a/modules/README.md b/modules/README.md index a14dbf5bb..05e7cfefd 100644 --- a/modules/README.md +++ b/modules/README.md @@ -7,4 +7,5 @@ | Name | Description | Kind | Categories | | --- | --- | --- | --- | | [count_events](/home/runner/work/functions/functions/modules/src/count_events) | Count events in each time window | monitoring_application | model-serving | +| [histogram_data_drift](/home/runner/work/functions/functions/modules/src/histogram_data_drift) | Model-monitoring application for detecting and visualizing data drift | monitoring_application | model-serving, structured-ML | diff --git a/modules/src/count_events/count_events.ipynb b/modules/src/count_events/count_events.ipynb index 54f657bb0..8a3cac849 100644 --- a/modules/src/count_events/count_events.ipynb +++ b/modules/src/count_events/count_events.ipynb @@ -1,35 +1,829 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "2f5aea66-03d3-4ba2-a0cb-3e74e8376ff0", + "metadata": {}, + "source": [ + "# Count Events Demo" + ] + }, + { + "cell_type": "markdown", + "id": "cdadd95e-d65f-4910-b72f-ef545c09c96b", + "metadata": {}, + "source": [ + "## Overview" + ] + }, + { + "cell_type": "markdown", + "id": "c336160a-3eba-40b3-8d02-7849ca74925b", + "metadata": {}, + "source": [ + "This notebook walks through a simple example of how to monitor a real-time serving function and how to add your a custom monitoring application from the hub.\n", + "For simplicity, we’ll use the Count Events application, which calculates the number of requests in each time window.\n", + "If you’d like to create your own model monitoring application (which can later be added to the hub), follow these instructions:https://docs.mlrun.org/en/stable/model-monitoring/applications.html\n", + "\n", + "To add a model monitoring application to your project from the hub, you can choose one of two approaches:\n", + "1. **Set it directly** – the application will be deployed as is.\n", + "2. **Import it as a module** – this lets you test and modify the application code before deploying it.\n" + ] + }, + { + "cell_type": "markdown", + "id": "1bcc90b4-f3c3-46ea-8348-1e7239e4e6e0", + "metadata": {}, + "source": [ + "## Demo" + ] + }, + { + "cell_type": "markdown", + "id": "2761fb6c-2c9d-4e8c-8efd-e01762b3bb22", + "metadata": {}, + "source": [ + "### Create a project" + ] + }, { "cell_type": "code", - "execution_count": null, - "id": "initial_id", + "execution_count": 1, + "id": "e06ac3e1-8afd-45ab-9448-f664a4e54640", "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2025-11-05 15:33:39,611 [warning] Failed resolving version info. Ignoring and using defaults\n", + "> 2025-11-05 15:33:43,049 [warning] Server or client version is unstable. Assuming compatible: {\"client_version\":\"0.0.0+unstable\",\"server_version\":\"1.11.0\"}\n", + "> 2025-11-05 15:33:58,614 [info] Created and saved project: {\"context\":\"./\",\"from_template\":null,\"name\":\"count-events-demo\",\"overwrite\":false,\"save\":true}\n", + "> 2025-11-05 15:33:58,616 [info] Project created successfully: {\"project_name\":\"count-events-demo\",\"stored_in_db\":true}\n" + ] + } + ], + "source": [ + "import mlrun\n", + "project = mlrun.get_or_create_project(\"count-events-demo\",'./')" + ] + }, + { + "cell_type": "markdown", + "id": "cb0c365d-243f-447d-a693-38007d38329a", + "metadata": {}, + "source": [ + "### Generate datastore profiles for model monitoring\n", + "Before you enable model monitoring, you must configure datastore profiles for TSDB and streaming endpoints. A datastore profile holds all the information required to address an external data source, including credentials.\n", + "Model monitoring supports Kafka and V3IO as streaming platforms, and TDEngine and V3IO as TSDB platforms.\n", + "\n", + "In this example we will use V3IO for both streaming and TSDB platforms." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "10df799e-0e63-409c-a204-551635c90410", + "metadata": {}, + "outputs": [], + "source": [ + "from mlrun.datastore.datastore_profile import (\n", + " DatastoreProfileV3io\n", + ")\n", + "\n", + "v3io_profile = DatastoreProfileV3io(name=\"v3io_profile\", v3io_access_key=mlrun.mlconf.get_v3io_access_key())\n", + "\n", + "project.register_datastore_profile(v3io_profile)\n", + "project.set_model_monitoring_credentials(stream_profile_name=v3io_profile.name, tsdb_profile_name=v3io_profile.name)" + ] + }, + { + "cell_type": "markdown", + "id": "94af15ae-b250-4583-950d-b14876065b8a", + "metadata": {}, + "source": [ + "### Deploy model monitoring infrastructure" + ] + }, + { + "cell_type": "markdown", + "id": "56b2adf8-dd65-4ee1-bf18-cd97eeb129b8", + "metadata": {}, + "source": [ + "Once you’ve provided the model monitoring credentials, you can enable monitoring capabilities for your project. \n", + "Visit MLRun's [Model Monitoring Architecture](https://docs.mlrun.org/en/stable/model-monitoring/index.html#model-monitoring-des) to read more." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a83f95bc-e6b5-4184-84cd-d3117f394b1c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2025-11-05 15:41:01 (info) Deploying function\n", + "2025-11-05 15:41:01 (info) Building\n", + "2025-11-05 15:41:01 (info) Staging files and preparing base images\n", + "2025-11-05 15:41:01 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n", + "2025-11-05 15:41:02 (info) Building processor image\n", + "2025-11-05 15:42:57 (info) Build complete\n", + "2025-11-05 15:43:07 (info) Function deploy complete\n", + "2025-11-05 15:40:57 (info) Deploying function\n", + "2025-11-05 15:40:57 (info) Building\n", + "2025-11-05 15:40:58 (info) Staging files and preparing base images\n", + "2025-11-05 15:40:58 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n", + "2025-11-05 15:40:58 (info) Building processor image\n", + "2025-11-05 15:42:53 (info) Build complete\n", + "2025-11-05 15:43:12 (info) Function deploy complete\n", + "2025-11-05 15:40:59 (info) Deploying function\n", + "2025-11-05 15:40:59 (info) Building\n", + "2025-11-05 15:40:59 (info) Staging files and preparing base images\n", + "2025-11-05 15:40:59 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n", + "2025-11-05 15:41:00 (info) Building processor image\n", + "2025-11-05 15:42:55 (info) Build complete\n", + "2025-11-05 15:43:03 (info) Function deploy complete\n" + ] + } + ], + "source": [ + "project.enable_model_monitoring(base_period=10, \n", + " deploy_histogram_data_drift_app=False, # built-in monitoring application for structured data \n", + " wait_for_deployment=True)" + ] + }, + { + "cell_type": "markdown", + "id": "e9f4186b-6f8f-479e-a603-d270397dd9ff", + "metadata": {}, + "source": [ + "### Log Models" + ] + }, + { + "cell_type": "markdown", + "id": "310fed55-3f62-4af8-800f-4fb2dccfe2fd", + "metadata": { + "tags": [] + }, + "source": [ + "We’ll generate some dummy classification models and log them to the project." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "fafcec2f-75d1-4af0-bbe0-b796367c48be", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import make_classification\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LinearRegression\n", + "import pickle\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6cabd9aa-87f2-4af7-a5c6-ea0417ceb33f", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare a model and generate training set\n", + "\n", + "X,y = make_classification(n_samples=200,n_features=5,random_state=42)\n", + "X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8,test_size=0.2,random_state=42)\n", + "model = LinearRegression()\n", + "model.fit(X_train,y_train)\n", + "X_test = pd.DataFrame(X_test,columns=[f\"column_{i}\" for i in range(5)])\n", + "y_test = pd.DataFrame(y_test,columns=[\"label\"])\n", + "training_set = pd.concat([X_test,y_test],axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3afde46a-9f26-4438-bedb-acad15866b03", + "metadata": {}, + "outputs": [], + "source": [ + "# Log your models\n", + "for i in range(5):\n", + " project.log_model(key=f\"model_{i}\",body=pickle.dumps(model),model_file=f'model.pkl',training_set=training_set,label_column=\"label\")" + ] + }, + { + "cell_type": "markdown", + "id": "49d820b1-9fd7-4184-9005-25d69578c995", + "metadata": {}, + "source": [ + "### Deploy Serving Function" + ] + }, + { + "cell_type": "markdown", + "id": "19fd7570-3f91-45ff-ba2b-4aebce4a95b4", + "metadata": {}, + "source": [ + "We’ll use a basic serving function and enrich it with the logged models.\n", + "\n", + "\n", + "Note that if you want to monitor a serving function along with its associated models, you must enable tracking by calling `set_tracking()`. Otherwise, the serving function’s requests won’t be monitored." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cb806c5b-a0a0-4deb-a63d-f2ea72dc3e02", + "metadata": {}, + "outputs": [], + "source": [ + "# Define the serving\n", + "serving = mlrun.new_function('serving-model-v1',kind='serving')\n", + "graph = serving.set_topology(\"router\", engine=\"sync\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "93ee54ec-0c4a-4eb1-8bc3-d065aec64c8f", + "metadata": {}, + "outputs": [], + "source": [ + "# Apply monitoring\n", + "serving.set_tracking()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f162a254-00ce-4c8a-89df-0cf5d25da5b1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5/5 [00:00<00:00, 22052.07it/s]\n" + ] + } + ], + "source": [ + "# Add models to your serving\n", + "models_uri = [model.uri for model in project.list_models(tag=\"latest\")]\n", + "i=0\n", + "from tqdm import tqdm\n", + "for uri in tqdm(models_uri):\n", + " serving.add_model(key=f'model_{i}',model_path=uri,class_name='mlrun.frameworks.sklearn.SKLearnModelServer')\n", + " i+=1" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ff91f360-5c85-4bc7-a3c3-80a31f1ebd3c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2025-11-05 15:55:08,989 [info] Starting remote function deploy\n", + "2025-11-05 15:55:09 (info) Deploying function\n", + "2025-11-05 15:55:09 (info) Building\n", + "2025-11-05 15:55:09 (info) Staging files and preparing base images\n", + "2025-11-05 15:55:09 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n", + "2025-11-05 15:55:09 (info) Building processor image\n", + "2025-11-05 15:56:54 (info) Build complete\n", + "2025-11-05 15:57:06 (info) Function deploy complete\n", + "> 2025-11-05 15:57:10,181 [info] Model endpoint creation task completed with state succeeded\n", + "> 2025-11-05 15:57:10,181 [info] Successfully deployed function: {\"external_invocation_urls\":[\"count-events-demo-serving-model-v1.default-tenant.app.vmdev211.lab.iguazeng.com/\"],\"internal_invocation_urls\":[\"nuclio-count-events-demo-serving-model-v1.default-tenant.svc.cluster.local:8080\"]}\n" + ] + } + ], + "source": [ + "# Deploy serving\n", + "serving_function = project.deploy_function(serving)" + ] + }, + { + "cell_type": "markdown", + "id": "1652a010-e086-4c62-9493-1a82bc125ad4", + "metadata": {}, + "source": [ + "### Invoke Serving" + ] + }, + { + "cell_type": "markdown", + "id": "4c937193-27bc-4b6f-bc1d-cf7472045778", + "metadata": {}, + "source": [ + "Let’s generate some dummy data and invoke our serving function." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "66f469db-9f5b-4e3d-bc85-160a9c90bc8f", + "metadata": {}, "outputs": [], "source": [ - "" + "serving = project.get_function(\"serving-model-v1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "50305c3e-bd1b-4240-9c63-9851173af75e", + "metadata": {}, + "outputs": [], + "source": [ + "inputs = [[-0.51,0.051,0.6287761723991921,-0.8751269647375463,-1.0660002219502747], [-0.51,0.051,0.6287761723991921,-0.8751269647375463,-1.0660002219502747], [-0.51,0.051,0.6287761723991921,-0.8751269647375463,-1.0660002219502747], [-0.51,0.051,0.6287761723991921,-0.8751269647375463,-1.0660002219502747], [-0.51,0.051,0.6287761723991921,-0.8751269647375463,-1.0660002219502747]]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9e8372d6-4fa7-4b45-8932-1f690b55048c", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "for i in range(5):\n", + " for j in range(100):\n", + " serving.invoke(f\"/v2/models/model_{i}/infer\", {\"inputs\": inputs})" + ] + }, + { + "cell_type": "markdown", + "id": "4eeb44e1-9c1a-430a-b978-f58f1adeaa12", + "metadata": {}, + "source": [ + "# Evaluate App" + ] + }, + { + "cell_type": "markdown", + "id": "936afba8-c06b-4141-a85e-5cbc9d32aa45", + "metadata": {}, + "source": [ + "Before deploying the Count Events application, let’s first test it to make sure it works as expected. We’ll import it as a module, which downloads the module file to your local filesystem, and then run it as a job using the `evaluate` mechanism." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "213425d1-8470-483e-b325-14aaa991c8c5", + "metadata": {}, + "outputs": [], + "source": [ + "# Import count events from the hub\n", + "count_events_app = mlrun.import_module(\"hub://count_events\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "d91450e4-effb-4963-b913-dcd9829e78b9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2025-11-05 15:57:37,746 [info] Changing function name - adding `\"-batch\"` suffix: {\"func_name\":\"countapp-batch\"}\n", + "> 2025-11-05 15:57:37,927 [info] Storing function: {\"db\":\"http://mlrun-api:8080\",\"name\":\"countapp-batch--handler\",\"uid\":\"b7c240fd99ed4c9b940db6a587a53b80\"}\n", + "> 2025-11-05 15:57:38,202 [info] Job is running in the background, pod: countapp-batch--handler-469fm\n", + "> 2025-11-05 15:57:42,390 [info] Counted events for model endpoint window: {\"count\":4,\"end\":\"NaT\",\"model_endpoint_name\":\"model_0\",\"start\":\"NaT\"}\n", + "> 2025-11-05 15:57:42,498 [info] To track results use the CLI: {\"info_cmd\":\"mlrun get run b7c240fd99ed4c9b940db6a587a53b80 -p count-events-demo\",\"logs_cmd\":\"mlrun logs b7c240fd99ed4c9b940db6a587a53b80 -p count-events-demo\"}\n", + "> 2025-11-05 15:57:42,498 [info] Or click for UI: {\"ui_url\":\"https://dashboard.default-tenant.app.vmdev211.lab.iguazeng.com/mlprojects/count-events-demo/jobs/monitor-jobs/countapp-batch--handler/b7c240fd99ed4c9b940db6a587a53b80/overview\"}\n", + "> 2025-11-05 15:57:42,499 [info] Run execution finished: {\"name\":\"countapp-batch--handler\",\"status\":\"completed\"}\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartendstatekindnamelabelsinputsparametersresults
count-events-demo0Nov 05 15:57:412025-11-05 15:57:42.474376+00:00completedruncountapp-batch--handler
v3io_user=iguazio
kind=job
owner=iguazio
mlrun/client_version=0.0.0+unstable
mlrun/client_python_version=3.11.12
host=countapp-batch--handler-469fm
sample_data
endpoints=['model_0']
write_output=False
existing_data_handling=fail_on_overlap
stream_profile=None
model_0-d25a6714a19b4027b9bccfe8adca8ddc_NaT_NaT={'metric_name': 'count', 'metric_value': 4.0}
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2025-11-05 15:57:46,373 [info] Run execution finished: {\"name\":\"countapp-batch--handler\",\"status\":\"completed\"}\n" + ] + } + ], + "source": [ + "# Run the app as a job\n", + "res = count_events_app.CountApp.evaluate(func_path=\"count_events.py\",\n", + " run_local=False,\n", + " sample_data=pd.DataFrame({\"col\": [1, 2, 3, 4]}),\n", + " image=image,\n", + " endpoints=[\"model_0\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "504adb0b-6ccf-421c-98fc-25ed1a8691e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'model_0-d25a6714a19b4027b9bccfe8adca8ddc_NaT_NaT': {'metric_name': 'count',\n", + " 'metric_value': 4.0}}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res.outputs" + ] + }, + { + "cell_type": "markdown", + "id": "3a05a1c9-b62d-470a-9e18-4c3f5ca61b91", + "metadata": {}, + "source": [ + "Now that the application is available on your filesystem, you can register and deploy it just like any other custom application." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "28bc9645-69b2-418d-a5c5-7ba94f64745f", + "metadata": {}, + "outputs": [], + "source": [ + "fn = project.set_model_monitoring_function(\n", + " func=\"count_events.py\",\n", + " application_class=\"CountApp\",\n", + " name=\"CountEventsFromFile\",\n", + " image=image,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f318f85f-76d8-4494-8029-870edf54df6b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2025-11-05 16:09:48,293 [info] Starting remote function deploy\n", + "2025-11-05 16:09:48 (info) Deploying function\n", + "2025-11-05 16:09:48 (info) Building\n", + "2025-11-05 16:09:48 (info) Staging files and preparing base images\n", + "2025-11-05 16:09:48 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n", + "2025-11-05 16:09:48 (info) Building processor image\n", + "2025-11-05 16:11:33 (info) Build complete\n", + "2025-11-05 16:11:41 (info) Function deploy complete\n", + "> 2025-11-05 16:11:49,604 [info] Model endpoint creation task completed with state succeeded\n", + "> 2025-11-05 16:11:49,605 [info] Successfully deployed function: {\"external_invocation_urls\":[],\"internal_invocation_urls\":[\"nuclio-count-events-demo-counteventsfromfile.default-tenant.svc.cluster.local:8080\"]}\n" + ] + }, + { + "data": { + "text/plain": [ + "DeployStatus(state=ready, outputs={'endpoint': 'http://nuclio-count-events-demo-counteventsfromfile.default-tenant.svc.cluster.local:8080', 'name': 'count-events-demo-counteventsfromfile'})" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project.deploy_function(fn)" + ] + }, + { + "cell_type": "markdown", + "id": "d2b527ee-19e6-4f89-9e51-702fa1707986", + "metadata": {}, + "source": [ + "## Set Application from Hub" + ] + }, + { + "cell_type": "markdown", + "id": "b8fa2433-535c-498b-a7ee-3d82d474d447", + "metadata": {}, + "source": [ + "As mentioned, you can set the application directly from the hub by providing a valid hub path (`hub://`)." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "61c50ac6-8dac-41a2-bb9c-705ab543e234", + "metadata": {}, + "outputs": [], + "source": [ + "fn = project.set_model_monitoring_function(\n", + " func=\"hub://count_events\",\n", + " application_class=\"CountApp\",\n", + " name=\"CountEvents\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "df313a94-d742-4ff6-8a28-8390322b8074", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2025-11-05 15:57:58,659 [info] Starting remote function deploy\n", + "2025-11-05 15:57:59 (info) Deploying function\n", + "2025-11-05 15:57:59 (info) Building\n", + "2025-11-05 15:57:59 (info) Staging files and preparing base images\n", + "2025-11-05 15:57:59 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n", + "2025-11-05 15:57:59 (info) Building processor image\n", + "2025-11-05 15:59:34 (info) Build complete\n", + "2025-11-05 15:59:42 (info) Function deploy complete\n", + "> 2025-11-05 15:59:49,826 [info] Model endpoint creation task completed with state succeeded\n", + "> 2025-11-05 15:59:49,827 [info] Successfully deployed function: {\"external_invocation_urls\":[],\"internal_invocation_urls\":[\"nuclio-count-events-demo-countevents.default-tenant.svc.cluster.local:8080\"]}\n" + ] + }, + { + "data": { + "text/plain": [ + "DeployStatus(state=ready, outputs={'endpoint': 'http://nuclio-count-events-demo-countevents.default-tenant.svc.cluster.local:8080', 'name': 'count-events-demo-countevents'})" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project.deploy_function(fn)" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "mlrun-base-py311", "language": "python", - "name": "python3" + "name": "conda-env-mlrun-base-py311-py" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.11.12" } }, "nbformat": 4, diff --git a/modules/src/count_events/count_events.py b/modules/src/count_events/count_events.py index c2d6444e4..1c6d97621 100644 --- a/modules/src/count_events/count_events.py +++ b/modules/src/count_events/count_events.py @@ -20,9 +20,20 @@ class CountApp(ModelMonitoringApplicationBase): + """ + Model Monitoring Application that counts the number of events in the given time window. + """ def do_tracking( - self, monitoring_context: mm_context.MonitoringApplicationContext + self, + monitoring_context: mm_context.MonitoringApplicationContext ) -> ModelMonitoringApplicationMetric: + """" + he do_tracking method implementation for the CountApp class. + It counts the number of events in the sample data-frame and logs the count. + + :param monitoring_context: The monitoring application context. It includes the current window data as a + pandas data-frame: monitoring_context.sample_df. + """ sample_df = monitoring_context.sample_df monitoring_context.logger.debug("Sample data-frame", sample_df=sample_df) count = len(sample_df) diff --git a/modules/src/count_events/item.yaml b/modules/src/count_events/item.yaml index e0eb09069..049651ddb 100644 --- a/modules/src/count_events/item.yaml +++ b/modules/src/count_events/item.yaml @@ -6,12 +6,12 @@ example: count_events.ipynb generationDate: 2025-09-16:12-25 hidden: false labels: - author: iguazio -mlrunVersion: 1.10.0-rc27 + author: Iguazio +mlrunVersion: 1.10.0-rc41 name: count_events spec: filename: count_events.py image: mlrun/mlrun kind: monitoring_application requirements: -version: 1.0.0 +version: 1.0.0 \ No newline at end of file diff --git a/modules/src/count_events/requirements.txt b/modules/src/count_events/requirements.txt index 89741402a..0c107c276 100644 --- a/modules/src/count_events/requirements.txt +++ b/modules/src/count_events/requirements.txt @@ -1,3 +1,3 @@ -mlrun==1.10.0-rc27 +mlrun==1.10.0-rc41 pandas==2.1.4 -pytest~=8.2 +pytest~=8.2 \ No newline at end of file diff --git a/modules/src/histogram_data_drift/assets/feature_stats.csv b/modules/src/histogram_data_drift/assets/feature_stats.csv new file mode 100644 index 000000000..de76ff176 --- /dev/null +++ b/modules/src/histogram_data_drift/assets/feature_stats.csv @@ -0,0 +1,23 @@ +,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm +0,0.0,0.0,0.0,0.0 +1,0.02666666666666667,0.006666666666666667,0.02666666666666667,0.22666666666666666 +2,0.03333333333333333,0.02,0.22,0.04666666666666667 +3,0.04666666666666667,0.02666666666666667,0.07333333333333333,0.04666666666666667 +4,0.10666666666666667,0.02,0.013333333333333334,0.006666666666666667 +5,0.06,0.05333333333333334,0.0,0.006666666666666667 +6,0.03333333333333333,0.09333333333333334,0.0,0.0 +7,0.08666666666666667,0.09333333333333334,0.006666666666666667,0.0 +8,0.09333333333333334,0.06666666666666667,0.013333333333333334,0.04666666666666667 +9,0.06666666666666667,0.17333333333333334,0.02,0.02 +10,0.04,0.07333333333333333,0.03333333333333333,0.03333333333333333 +11,0.06666666666666667,0.12666666666666668,0.08,0.14 +12,0.10666666666666667,0.08,0.09333333333333334,0.08 +13,0.04666666666666667,0.04,0.08,0.02666666666666667 +14,0.07333333333333333,0.02666666666666667,0.11333333333333333,0.013333333333333334 +15,0.02666666666666667,0.06,0.04,0.08 +16,0.013333333333333334,0.013333333333333334,0.08,0.07333333333333333 +17,0.02666666666666667,0.006666666666666667,0.04666666666666667,0.04 +18,0.006666666666666667,0.006666666666666667,0.02666666666666667,0.02 +19,0.03333333333333333,0.006666666666666667,0.013333333333333334,0.05333333333333334 +20,0.006666666666666667,0.006666666666666667,0.02,0.04 +21,0.0,0.0,0.0,0.0 diff --git a/modules/src/histogram_data_drift/assets/sample_df_stats.csv b/modules/src/histogram_data_drift/assets/sample_df_stats.csv new file mode 100644 index 000000000..dc02ef3ba --- /dev/null +++ b/modules/src/histogram_data_drift/assets/sample_df_stats.csv @@ -0,0 +1,23 @@ +,p0,petal_length_cm,petal_width_cm,sepal_length_cm,sepal_width_cm +0,0.0,1.0,1.0,1.0,1.0 +1,0.0,0.0,0.0,0.0,0.0 +2,0.0,0.0,0.0,0.0,0.0 +3,0.0,0.0,0.0,0.0,0.0 +4,0.0,0.0,0.0,0.0,0.0 +5,0.0,0.0,0.0,0.0,0.0 +6,0.0,0.0,0.0,0.0,0.0 +7,0.0,0.0,0.0,0.0,0.0 +8,0.0,0.0,0.0,0.0,0.0 +9,0.0,0.0,0.0,0.0,0.0 +10,0.0,0.0,0.0,0.0,0.0 +11,1.0,0.0,0.0,0.0,0.0 +12,0.0,0.0,0.0,0.0,0.0 +13,0.0,0.0,0.0,0.0,0.0 +14,0.0,0.0,0.0,0.0,0.0 +15,0.0,0.0,0.0,0.0,0.0 +16,0.0,0.0,0.0,0.0,0.0 +17,0.0,0.0,0.0,0.0,0.0 +18,0.0,0.0,0.0,0.0,0.0 +19,0.0,0.0,0.0,0.0,0.0 +20,0.0,0.0,0.0,0.0,0.0 +21,0.0,0.0,0.0,0.0,0.0 diff --git a/modules/src/histogram_data_drift/histogram_data_drift.ipynb b/modules/src/histogram_data_drift/histogram_data_drift.ipynb new file mode 100644 index 000000000..eceb28ca3 --- /dev/null +++ b/modules/src/histogram_data_drift/histogram_data_drift.ipynb @@ -0,0 +1,309 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "283b6000-4acd-4eb3-bf51-25ee79e9e5dc", + "metadata": {}, + "source": [ + "# Histogram Data Drift Demo\n", + "The Histogram Data Drift monitoring app is MLRun’s default data drift application for model monitoring. It’s considered a built-in app within the model monitoring flow and is deployed by default when model monitoring is enabled for a project. For more information, see the [MLRun documentation](https://docs.mlrun.org/en/latest/model-monitoring/index.html#model-monitoring-applications).\n", + "\n", + "This notebook walks through a simple example of using this app from the hub to monitor data drift between a baseline dataset and a new dataset, using the `evaluate()` method." + ] + }, + { + "cell_type": "markdown", + "id": "da432405-e8bb-400c-b1e0-45e31b0571f1", + "metadata": {}, + "source": [ + "## Set up a project and prepare the data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "62fcc7a4-4df5-4f2e-bd97-6aa831bbf958", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun\n", + "project = mlrun.get_or_create_project(\"histogram-data-drift-demo\",'./histogram-data-drift-demo')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d7ec1628-0303-4bbb-ba34-5cd96eaef304", + "metadata": {}, + "outputs": [], + "source": [ + "sample_data = mlrun.get_sample_path(\"data/batch-predict/training_set.parquet\")\n", + "reference_data = mlrun.get_sample_path(\"data/batch-predict/prediction_set.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "072f1411-33a2-444e-88bf-76d9394d7877", + "metadata": {}, + "source": [ + "## Get the module from the hub and edit its defaults" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5c04dec9-ea6e-410e-a36d-42a71a223caa", + "metadata": {}, + "outputs": [], + "source": [ + "hub_mod = mlrun.get_hub_module(\"hub://histogram_data_drift\", download_files=True)\n", + "src_file_path = hub_mod.get_module_file_path()" + ] + }, + { + "cell_type": "markdown", + "id": "ce26e487-bfe5-442c-9d5a-04a8d75407a6", + "metadata": {}, + "source": [ + "Since the histogram data drift application doesn’t produce artifacts by default, we need to modify the class defaults. This can be done in one of two ways: either by editing the downloaded source file directly and then evaluating with the standard class, or - as we’ll do now - by adding an inheriting class to the same file and evaluating using that new class." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "055a31d8-00fd-4f55-b07c-1169db6af919", + "metadata": {}, + "outputs": [], + "source": [ + "# add a declaration of an inheriting class to change the default parameters\n", + "wrapper_code = \"\"\"\n", + "class HistogramDataDriftApplicationWithArtifacts(HistogramDataDriftApplication):\n", + " # The same histogram application but with artifacts\n", + "\n", + " def __init__(self) -> None:\n", + " super().__init__(produce_json_artifact=True, produce_plotly_artifact=True)\n", + "\"\"\"\n", + "with open(src_file_path, \"a\") as f:\n", + " f.write(wrapper_code)" + ] + }, + { + "cell_type": "markdown", + "id": "c17b176b-f838-472f-aaeb-7cedaeb66b56", + "metadata": {}, + "source": [ + "Now we can actually import it as a module, using the `module()` method" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6f57d3c9-9e7e-4fde-b78b-2daf799893e1", + "metadata": {}, + "outputs": [], + "source": [ + "app_module = hub_mod.module()\n", + "hist_app = app_module.HistogramDataDriftApplicationWithArtifacts # or the standard class if you chose to modify its code" + ] + }, + { + "cell_type": "markdown", + "id": "a017bc5a-4935-456b-8648-57c11e11df27", + "metadata": {}, + "source": [ + "And we are ready to call `evaluate()` (notice that the run is linked to the current (active) project)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c20fc990-d0e6-4aab-a576-29cea322bfb5", + "metadata": {}, + "outputs": [], + "source": [ + "run_result = hist_app.evaluate(\n", + " func_path=hub_mod.get_module_file_path(),\n", + " sample_data=sample_data,\n", + " reference_data=reference_data,\n", + " run_local=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "661cdf4d-ee2a-4156-8a71-59f2a1e3b9eb", + "metadata": {}, + "source": [ + "## Examine the results" + ] + }, + { + "cell_type": "markdown", + "id": "e715b6aa-75c0-4352-b98f-bd5a790e1d06", + "metadata": {}, + "source": [ + "First, we'll print nicely the average results:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3688d6a0-6cae-4141-8851-dfd12842c484", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hellinger_mean : 0.34211088243167637\n", + "kld_mean : 2.2839485090490426\n", + "tvd_mean : 0.30536\n", + "general_drift : 0.3237354412158382\n" + ] + } + ], + "source": [ + "for i in range (3):\n", + " metric = run_result.status.results[\"return\"][i]\n", + " print(metric[\"metric_name\"], \": \", metric[\"metric_value\"])\n", + "result = run_result.status.results[\"return\"][3]\n", + "print(result[\"result_name\"], \": \", result[\"result_value\"])" + ] + }, + { + "cell_type": "markdown", + "id": "0422ca13-661b-4574-ad51-d1665be6acdb", + "metadata": {}, + "source": [ + "And we can also examine these metrics per feature, along with other metrics, using the artifacts the app generated for us.\n", + "\n", + "The rightmost column indicates whether the feature has drifted or not. The drift decision rule is the value per-feature mean of the Total Variance Distance (TVD) and Hellinger distance scores. In the histogram-data-drift application, the \"Drift detected\" threshold is 0.7 and the \"Drift suspected\" threshold is 0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d9e7e688-6a71-4b9b-8b99-b2d7f42077e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# The artifact is logged with the run's name\n", + "artifact_key = f\"{run_result.metadata.name}_drift_table_plot\"\n", + "artifact = project.get_artifact(artifact_key)\n", + "artifact.to_dataitem().show()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f8a17a07-6cc4-4bf3-abd8-187042b1973a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Drift value per feature:\n" + ] + }, + { + "data": { + "application/json": { + "feature_0": 0.034754757, + "feature_1": 0.0409220715, + "feature_10": 0.0529929347, + "feature_11": 0.7582778852, + "feature_12": 0.7680105477, + "feature_13": 0.0359189896, + "feature_14": 0.0388433161, + "feature_15": 0.6959895187, + "feature_16": 0.7682657628, + "feature_17": 0.0381781891, + "feature_18": 0.032682812, + "feature_19": 0.7400673333, + "feature_2": 0.7365591239, + "feature_3": 0.0492651761, + "feature_4": 0.0373909913, + "feature_5": 0.0374548709, + "feature_6": 0.7788618285, + "feature_7": 0.7443223594, + "feature_8": 0.0381141123, + "feature_9": 0.0478362439 + }, + "text/plain": [ + "" + ] + }, + "metadata": { + "application/json": { + "expanded": false, + "root": "root" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "print(\"Drift value per feature:\")\n", + "artifact_key = f\"{run_result.metadata.name}_features_drift_results\"\n", + "artifact = project.get_artifact(artifact_key)\n", + "artifact.to_dataitem().show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a8767ca-8a65-4841-9ced-4f36e86bb789", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base-py311", + "language": "python", + "name": "conda-env-mlrun-base-py311-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/modules/src/histogram_data_drift/histogram_data_drift.py b/modules/src/histogram_data_drift/histogram_data_drift.py new file mode 100644 index 000000000..b8cdcf299 --- /dev/null +++ b/modules/src/histogram_data_drift/histogram_data_drift.py @@ -0,0 +1,388 @@ +# Copyright 2024 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Final, Optional, Protocol, Union, cast + +import numpy as np +from pandas import DataFrame, Series + +import mlrun.artifacts +import mlrun.common.model_monitoring.helpers +import mlrun.model_monitoring.applications.context as mm_context +import mlrun.model_monitoring.applications.results as mm_results +import mlrun.model_monitoring.features_drift_table as mm_drift_table +from mlrun.common.schemas.model_monitoring.constants import ( + ResultKindApp, + ResultStatusApp, + StatsKind, +) +from mlrun.model_monitoring.applications import ( + ModelMonitoringApplicationBase, +) +from mlrun.model_monitoring.metrics.histogram_distance import ( + HellingerDistance, + HistogramDistanceMetric, + KullbackLeiblerDivergence, + TotalVarianceDistance, +) + + +class InvalidMetricValueError(ValueError): + pass + + +class InvalidThresholdValueError(ValueError): + pass + + +class ValueClassifier(Protocol): + def value_to_status(self, value: float) -> ResultStatusApp: ... + + +class HistogramDataDriftApplicationConstants: + NAME = "histogram-data-drift" + GENERAL_RESULT_NAME = "general_drift" + + +@dataclass +class DataDriftClassifier: + """ + Classify data drift numeric values into categorical status. + """ + + potential: float = 0.5 + detected: float = 0.7 + + def __post_init__(self) -> None: + """Catch erroneous threshold values""" + if not 0 < self.potential < self.detected < 1: + raise InvalidThresholdValueError( + "The provided thresholds do not comply with the rules" + ) + + def value_to_status(self, value: float) -> ResultStatusApp: + """ + Translate the numeric value into status category. + + :param value: The numeric value of the data drift metric, between 0 and 1. + :returns: `ResultStatusApp` according to the classification. + """ + if value > 1 or value < 0: + raise InvalidMetricValueError( + f"{value = } is invalid, must be in the range [0, 1]." + ) + if value >= self.detected: + return ResultStatusApp.detected + if value >= self.potential: + return ResultStatusApp.potential_detection + return ResultStatusApp.no_detection + + +class HistogramDataDriftApplication(ModelMonitoringApplicationBase): + """ + MLRun's default data drift application for model monitoring. + + The application expects tabular numerical data, and calculates three metrics over the shared features' histograms. + The metrics are calculated on features that have reference data from the training dataset. When there is no + reference data (`feature_stats`), this application send a warning log and does nothing. + The three metrics are: + + * Hellinger distance. + * Total variance distance. + * Kullback-Leibler divergence. + + Each metric is calculated over all the features individually and the mean is taken as the metric value. + The average of Hellinger and total variance distance is taken as the result. + + The application can log two artifacts (disabled by default due to performance issues): + + * JSON with the general drift value per feature. + * Plotly table with the various metrics and histograms per feature. + + If you want to change the application defaults, such as the classifier or which artifacts to produce, you + can either modify the downloaded source code file directly, or inherit from this class (in the same file), then + deploy it as any other model monitoring application. + Please make sure to keep the default application name. This ensures that the full functionality of the application, + including the statistics view in the UI, is available. + """ + + NAME: Final[str] = HistogramDataDriftApplicationConstants.NAME + + _REQUIRED_METRICS = {HellingerDistance, TotalVarianceDistance} + _STATS_TYPES: tuple[StatsKind, StatsKind] = ( + StatsKind.CURRENT_STATS, + StatsKind.DRIFT_MEASURES, + ) + + metrics: list[type[HistogramDistanceMetric]] = [ + HellingerDistance, + KullbackLeiblerDivergence, + TotalVarianceDistance, + ] + + def __init__( + self, + value_classifier: Optional[ValueClassifier] = None, + produce_json_artifact: bool = False, + produce_plotly_artifact: bool = False, + ) -> None: + """ + :param value_classifier: Classifier object that adheres to the :py:class:`~ValueClassifier` protocol. + If not provided, the default :py:class:`~DataDriftClassifier` is used. + :param produce_json_artifact: Whether to produce the JSON artifact or not, ``False`` by default. + :param produce_plotly_artifact: Whether to produce the Plotly artifact or not, ``False`` by default. + """ + self._value_classifier = value_classifier or DataDriftClassifier() + assert self._REQUIRED_METRICS <= set( + self.metrics + ), "TVD and Hellinger distance are required for the general data drift result" + + self._produce_json_artifact = produce_json_artifact + self._produce_plotly_artifact = produce_plotly_artifact + + def _compute_metrics_per_feature( + self, monitoring_context: mm_context.MonitoringApplicationContext + ) -> DataFrame: + """Compute the metrics for the different features and labels""" + metrics_per_feature = DataFrame( + columns=[metric_class.NAME for metric_class in self.metrics] + ) + feature_stats = monitoring_context.dict_to_histogram( + monitoring_context.feature_stats + ) + sample_df_stats = monitoring_context.dict_to_histogram( + monitoring_context.sample_df_stats + ) + for feature_name in feature_stats: + sample_hist = np.asarray(sample_df_stats[feature_name]) + reference_hist = np.asarray(feature_stats[feature_name]) + monitoring_context.logger.info( + "Computing metrics for feature", feature_name=feature_name + ) + metrics_per_feature.loc[feature_name] = { # pyright: ignore[reportCallIssue,reportArgumentType] + metric.NAME: metric( + distrib_t=sample_hist, distrib_u=reference_hist + ).compute() + for metric in self.metrics + } + monitoring_context.logger.info("Finished computing the metrics") + + return metrics_per_feature + + def _get_general_drift_result( + self, metrics: list[mm_results.ModelMonitoringApplicationMetric] + ) -> mm_results.ModelMonitoringApplicationResult: + """Get the general drift result from the metrics list""" + value = cast( + float, + np.mean( + [ + metric.value + for metric in metrics + if metric.name + in [ + f"{HellingerDistance.NAME}_mean", + f"{TotalVarianceDistance.NAME}_mean", + ] + ] + ), + ) + + status = self._value_classifier.value_to_status(value) + + return mm_results.ModelMonitoringApplicationResult( + name=HistogramDataDriftApplicationConstants.GENERAL_RESULT_NAME, + value=value, + kind=ResultKindApp.data_drift, + status=status, + ) + + @staticmethod + def _get_metrics( + metrics_per_feature: DataFrame, + ) -> list[mm_results.ModelMonitoringApplicationMetric]: + """Average the metrics over the features and add the status""" + metrics: list[mm_results.ModelMonitoringApplicationMetric] = [] + + metrics_mean = metrics_per_feature.mean().to_dict() + + for name, value in metrics_mean.items(): + metrics.append( + mm_results.ModelMonitoringApplicationMetric( + name=f"{name}_mean", + value=value, + ) + ) + + return metrics + + @staticmethod + def _get_stats( + metrics: list[mm_results.ModelMonitoringApplicationMetric], + metrics_per_feature: DataFrame, + monitoring_context: mm_context.MonitoringApplicationContext, + ) -> list[mm_results._ModelMonitoringApplicationStats]: + """ + Return a list of the statistics. + + :param metrics: the calculated metrics + :param metrics_per_feature: metric calculated per feature + :param monitoring_context: context object for current monitoring application + :returns: list of mm_results._ModelMonitoringApplicationStats for histogram data drift application + """ + stats = [] + for stats_type in HistogramDataDriftApplication._STATS_TYPES: + stats.append( + mm_results._ModelMonitoringApplicationStats( + name=stats_type, + stats=metrics_per_feature.T.to_dict() + | {metric.name: metric.value for metric in metrics} + if stats_type == StatsKind.DRIFT_MEASURES + else monitoring_context.sample_df_stats, + timestamp=monitoring_context.end_infer_time.isoformat( + sep=" ", timespec="microseconds" + ), + ) + ) + return stats + + @staticmethod + def _get_shared_features_sample_stats( + monitoring_context: mm_context.MonitoringApplicationContext, + ) -> mlrun.common.model_monitoring.helpers.FeatureStats: + """ + Filter out features without reference data in `feature_stats`, e.g. `timestamp`. + """ + return mlrun.common.model_monitoring.helpers.FeatureStats( + { + key: monitoring_context.sample_df_stats[key] + for key in monitoring_context.feature_stats + } + ) + + @staticmethod + def _log_json_artifact( + drift_per_feature_values: Series, + monitoring_context: mm_context.MonitoringApplicationContext, + ) -> None: + """Log the drift values as a JSON artifact""" + monitoring_context.logger.debug("Logging drift value per feature JSON artifact") + monitoring_context.log_artifact( + mlrun.artifacts.Artifact( + body=drift_per_feature_values.to_json(), + format="json", + key="features_drift_results", + ) + ) + monitoring_context.logger.debug("Logged JSON artifact successfully") + + def _log_plotly_table_artifact( + self, + sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats, + inputs_statistics: mlrun.common.model_monitoring.helpers.FeatureStats, + metrics_per_feature: DataFrame, + drift_per_feature_values: Series, + monitoring_context: mm_context.MonitoringApplicationContext, + ) -> None: + """Log the Plotly drift table artifact""" + monitoring_context.logger.debug( + "Feature stats", + sample_set_statistics=sample_set_statistics, + inputs_statistics=inputs_statistics, + ) + + monitoring_context.logger.debug("Computing drift results per feature") + drift_results = { + cast(str, key): (self._value_classifier.value_to_status(value), value) + for key, value in drift_per_feature_values.items() + } + monitoring_context.logger.debug("Producing plotly artifact") + artifact = mm_drift_table.FeaturesDriftTablePlot().produce( + sample_set_statistics=sample_set_statistics, + inputs_statistics=inputs_statistics, + metrics=metrics_per_feature.T.to_dict(), # pyright: ignore[reportArgumentType] + drift_results=drift_results, + ) + monitoring_context.logger.debug("Logging plotly artifact") + monitoring_context.log_artifact(artifact) + monitoring_context.logger.debug("Logged plotly artifact successfully") + + def _log_drift_artifacts( + self, + monitoring_context: mm_context.MonitoringApplicationContext, + metrics_per_feature: DataFrame, + ) -> None: + """Log JSON and Plotly drift data per feature artifacts""" + if not self._produce_json_artifact and not self._produce_plotly_artifact: + return + + drift_per_feature_values = metrics_per_feature[ + [HellingerDistance.NAME, TotalVarianceDistance.NAME] + ].mean(axis=1) + + if self._produce_json_artifact: + self._log_json_artifact(drift_per_feature_values, monitoring_context) + + if self._produce_plotly_artifact: + self._log_plotly_table_artifact( + sample_set_statistics=self._get_shared_features_sample_stats( + monitoring_context + ), + inputs_statistics=monitoring_context.feature_stats, + metrics_per_feature=metrics_per_feature, + drift_per_feature_values=drift_per_feature_values, + monitoring_context=monitoring_context, + ) + + def do_tracking( + self, monitoring_context: mm_context.MonitoringApplicationContext + ) -> list[ + Union[ + mm_results.ModelMonitoringApplicationResult, + mm_results.ModelMonitoringApplicationMetric, + mm_results._ModelMonitoringApplicationStats, + ] + ]: + """ + Calculate and return the data drift metrics, averaged over the features. + """ + monitoring_context.logger.debug("Starting to run the application") + if not monitoring_context.feature_stats: + monitoring_context.logger.warning( + "No feature statistics found, skipping the application. \n" + "In order to run the application, training set must be provided when logging the model." + ) + return [] + metrics_per_feature = self._compute_metrics_per_feature( + monitoring_context=monitoring_context + ) + monitoring_context.logger.debug("Saving artifacts") + self._log_drift_artifacts( + monitoring_context=monitoring_context, + metrics_per_feature=metrics_per_feature, + ) + monitoring_context.logger.debug("Computing average per metric") + metrics = self._get_metrics(metrics_per_feature) + result = self._get_general_drift_result(metrics=metrics) + stats = self._get_stats( + metrics=metrics, + monitoring_context=monitoring_context, + metrics_per_feature=metrics_per_feature, + ) + metrics_result_and_stats = metrics + [result] + stats + monitoring_context.logger.debug( + "Finished running the application", results=metrics_result_and_stats + ) + return metrics_result_and_stats diff --git a/modules/src/histogram_data_drift/item.yaml b/modules/src/histogram_data_drift/item.yaml new file mode 100644 index 000000000..e439e1699 --- /dev/null +++ b/modules/src/histogram_data_drift/item.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +categories: +- model-serving +- structured-ML +description: Model-monitoring application for detecting and visualizing data drift +example: histogram_data_drift.ipynb +generationDate: 2025-11-06 +hidden: false +labels: + author: Iguazio +mlrunVersion: 1.10.0-rc41 +name: histogram_data_drift +spec: + filename: histogram_data_drift.py + image: mlrun/mlrun + kind: monitoring_application + requirements: + - plotly~=5.23 + - pandas +version: 1.0.0 \ No newline at end of file diff --git a/modules/src/histogram_data_drift/requirements.txt b/modules/src/histogram_data_drift/requirements.txt new file mode 100644 index 000000000..4c3614d2b --- /dev/null +++ b/modules/src/histogram_data_drift/requirements.txt @@ -0,0 +1,3 @@ +hypothesis[numpy]~=6.103 +plotly~=5.23 +pandas \ No newline at end of file diff --git a/modules/src/histogram_data_drift/test_histogram_data_drift.py b/modules/src/histogram_data_drift/test_histogram_data_drift.py new file mode 100644 index 000000000..018edaa86 --- /dev/null +++ b/modules/src/histogram_data_drift/test_histogram_data_drift.py @@ -0,0 +1,279 @@ +# Copyright 2024 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from pathlib import Path +from unittest.mock import Mock + +import pandas as pd +import pytest +from hypothesis import given +from hypothesis import strategies as st + +import mlrun.common.model_monitoring.helpers +import mlrun.model_monitoring.applications +import mlrun.model_monitoring.applications.context as mm_context +import mlrun.utils +from mlrun.common.schemas.model_monitoring.constants import ( + ResultKindApp, + ResultStatusApp, +) +from histogram_data_drift import ( + DataDriftClassifier, + HistogramDataDriftApplication, + InvalidMetricValueError, + InvalidThresholdValueError, +) + +assets_folder = Path(__file__).parent / "assets" + + +@pytest.fixture +def project(tmp_path: Path) -> mlrun.MlrunProject: + project = mlrun.get_or_create_project("temp", allow_cross_project=True) + project.artifact_path = str(tmp_path) + return project + + +@pytest.fixture +def application() -> HistogramDataDriftApplication: + app = HistogramDataDriftApplication( + produce_json_artifact=True, produce_plotly_artifact=True + ) + return app + + +@pytest.fixture +def logger() -> mlrun.utils.Logger: + return mlrun.utils.Logger(level=logging.DEBUG, name="test_histogram_data_drift_app") + + +class TestDataDriftClassifier: + @staticmethod + @pytest.mark.parametrize( + ("potential", "detected"), [(0.4, 0.2), (0.0, 0.5), (0.7, 1.0), (-1, 2)] + ) + def test_invalid_threshold(potential: float, detected: float) -> None: + with pytest.raises(InvalidThresholdValueError): + DataDriftClassifier(potential=potential, detected=detected) + + @staticmethod + @given( + st.one_of( + st.floats(max_value=0, exclude_max=True), + st.floats(min_value=1, exclude_min=True), + ) + ) + def test_invalid_metric(value: float) -> None: + with pytest.raises(InvalidMetricValueError): + DataDriftClassifier().value_to_status(value) + + @staticmethod + @pytest.fixture + def classifier() -> DataDriftClassifier: + return DataDriftClassifier(potential=0.5, detected=0.7) + + @staticmethod + @pytest.mark.parametrize( + ("value", "expected_status"), + [ + (0, ResultStatusApp.no_detection), + (0.2, ResultStatusApp.no_detection), + (0.5, ResultStatusApp.potential_detection), + (0.6, ResultStatusApp.potential_detection), + (0.71, ResultStatusApp.detected), + (1, ResultStatusApp.detected), + ], + ) + def test_status( + classifier: DataDriftClassifier, value: float, expected_status: ResultStatusApp + ) -> None: + assert ( + classifier.value_to_status(value) == expected_status + ), "The status is different than expected" + + +class TestApplication: + COUNT = 12 # the sample df size + + @classmethod + @pytest.fixture + def sample_df_stats(cls) -> mlrun.common.model_monitoring.helpers.FeatureStats: + return mlrun.common.model_monitoring.helpers.FeatureStats( + { + "timestamp": { + "count": cls.COUNT, + "25%": "2024-03-11 09:31:39.152301+00:00", + "50%": "2024-03-11 09:31:39.152301+00:00", + "75%": "2024-03-11 09:31:39.152301+00:00", + "max": "2024-03-11 09:31:39.152301+00:00", + "mean": "2024-03-11 09:31:39.152301+00:00", + "min": "2024-03-11 09:31:39.152301+00:00", + }, + "ticker": { + "count": cls.COUNT, + "unique": 1, + "top": "AAPL", + "freq": cls.COUNT, + }, + "f1": { + "count": cls.COUNT, + "hist": [[2, 3, 0, 3, 1, 3], [-10, -5, 0, 5, 10, 15, 20]], + }, + "f2": { + "count": cls.COUNT, + "hist": [[0, 6, 0, 2, 1, 3], [66, 67, 68, 69, 70, 71, 72]], + }, + "l": { + "count": cls.COUNT, + "hist": [ + [10, 0, 0, 0, 0, 2], + [0.0, 0.16, 0.33, 0.5, 0.67, 0.83, 1.0], + ], + }, + } + ) + + @staticmethod + @pytest.fixture + def feature_stats() -> mlrun.common.model_monitoring.helpers.FeatureStats: + return mlrun.common.model_monitoring.helpers.FeatureStats( + { + "f1": { + "count": 100, + "hist": [[0, 0, 0, 30, 70, 0], [-10, -5, 0, 5, 10, 15, 20]], + }, + "f2": { + "count": 100, + "hist": [[0, 45, 5, 15, 35, 0], [66, 67, 68, 69, 70, 71, 72]], + }, + "l": { + "count": 100, + "hist": [ + [30, 0, 0, 0, 0, 70], + [0.0, 0.16, 0.33, 0.5, 0.67, 0.83, 1.0], + ], + }, + } + ) + + @staticmethod + @pytest.fixture + def monitoring_context( + sample_df_stats: mlrun.common.model_monitoring.helpers.FeatureStats, + feature_stats: mlrun.common.model_monitoring.helpers.FeatureStats, + application: HistogramDataDriftApplication, + logger: mlrun.utils.Logger, + project: mlrun.MlrunProject, + ) -> mm_context.MonitoringApplicationContext: + monitoring_context = mm_context.MonitoringApplicationContext( + application_name=application.NAME, + event={}, + artifacts_logger=project, + logger=logger, + project=project, + nuclio_logger=logger, # the wrong type but works here + ) + monitoring_context._sample_df_stats = sample_df_stats + monitoring_context._feature_stats = feature_stats + + return monitoring_context + + @classmethod + def test( + cls, + application: HistogramDataDriftApplication, + monitoring_context: mm_context.MonitoringApplicationContext, + project: mlrun.MlrunProject, + ) -> None: + results = application.do_tracking(monitoring_context) + metrics = [] + assert len(results) == 6, "Expected four results & metrics % stats" + for res in results: + if isinstance( + res, + mlrun.model_monitoring.applications.ModelMonitoringApplicationResult, + ): + assert ( + res.kind == ResultKindApp.data_drift + ), "The kind should be data drift" + assert ( + res.name == "general_drift" + ), "The result name should be general_drift" + assert ( + res.status == ResultStatusApp.potential_detection + ), "Expected potential detection in the general drift" + elif isinstance( + res, + mlrun.model_monitoring.applications.ModelMonitoringApplicationMetric, + ): + metrics.append(res) + assert len(metrics) == 3, "Expected three metrics" + + # Check the artifacts + assert project._artifact_manager.artifact_uris.keys() == { + "features_drift_results", + "drift_table_plot", + }, "The artifacts in the artifact manager are different than expected" + assert {f.name for f in Path(project.artifact_path).glob("*")} == { + "drift_table_plot.html", + "features_drift_results.json", + }, "The artifact files were not found or are different than expected" + + +class TestMetricsPerFeature: + @staticmethod + @pytest.fixture + def monitoring_context( + logger: mlrun.utils.Logger, + ) -> mm_context.MonitoringApplicationContext: + ctx = Mock() + + def dict_to_histogram(df: pd.DataFrame) -> pd.DataFrame: + return df + + ctx.dict_to_histogram = dict_to_histogram + ctx.logger = logger + return ctx + + @staticmethod + @pytest.mark.parametrize( + ("sample_df_stats", "feature_stats"), + [ + pytest.param(pd.DataFrame(), pd.DataFrame(), id="empty-dfs"), + pytest.param( + pd.read_csv(assets_folder / "sample_df_stats.csv", index_col=0), + pd.read_csv(assets_folder / "feature_stats.csv", index_col=0), + id="real-world-csv-dfs", + ), + ], + ) + def test_compute_metrics_per_feature( + application: HistogramDataDriftApplication, + monitoring_context: Mock, + sample_df_stats: pd.DataFrame, + feature_stats: pd.DataFrame, + ) -> None: + monitoring_context.sample_df_stats = sample_df_stats + monitoring_context.feature_stats = feature_stats + + metrics_per_feature = application._compute_metrics_per_feature( + monitoring_context=monitoring_context + ) + assert set(metrics_per_feature.columns) == { + metric.NAME for metric in application.metrics + }, "Different metrics than expected" + assert set(metrics_per_feature.index) == set( + feature_stats.columns + ), "The features are different than expected" diff --git a/requirements.txt b/requirements.txt index e58ca8e98..c393fd552 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ wheel bs4 mlrun>=1.0.0 jinja2~=3.1.2 +click>=8.0 pipenv myst_nb black>=24.3.0