From 72d5485d37f4503f77c6ac4e093e70bd4bd248d9 Mon Sep 17 00:00:00 2001 From: Daniel Perez <100069700+danielperezz@users.noreply.github.com> Date: Sun, 28 Sep 2025 15:43:07 +0300 Subject: [PATCH 01/17] replace author to Iguazio manually (#905) --- functions/src/aggregate/item.yaml | 2 +- functions/src/arc_to_parquet/item.yaml | 2 +- functions/src/auto_trainer/item.yaml | 2 +- functions/src/azureml_serving/function.yaml | 2 +- functions/src/azureml_serving/item.yaml | 2 +- functions/src/azureml_utils/item.yaml | 2 +- functions/src/batch_inference/item.yaml | 2 +- functions/src/batch_inference_v2/item.yaml | 2 +- functions/src/describe/item.yaml | 2 +- functions/src/describe_spark/item.yaml | 3 ++- functions/src/feature_selection/item.yaml | 2 +- functions/src/gen_class_data/item.yaml | 2 +- functions/src/github_utils/function.yaml | 2 +- functions/src/github_utils/item.yaml | 2 +- functions/src/hugging_face_serving/item.yaml | 2 +- functions/src/load_dataset/function.yaml | 2 +- functions/src/load_dataset/item.yaml | 2 +- functions/src/mlflow_utils/item.yaml | 2 +- functions/src/model_server/item.yaml | 2 +- functions/src/model_server_tester/function.yaml | 2 +- functions/src/model_server_tester/item.yaml | 2 +- functions/src/noise_reduction/item.yaml | 2 +- functions/src/onnx_utils/item.yaml | 2 +- functions/src/open_archive/item.yaml | 2 +- functions/src/pii_recognizer/item.yaml | 2 +- functions/src/pyannote_audio/item.yaml | 2 +- functions/src/question_answering/item.yaml | 2 +- functions/src/send_email/function.yaml | 2 +- functions/src/send_email/item.yaml | 2 +- functions/src/silero_vad/item.yaml | 2 +- functions/src/sklearn_classifier/item.yaml | 2 +- functions/src/sklearn_classifier_dask/function.yaml | 2 +- functions/src/sklearn_classifier_dask/item.yaml | 2 +- functions/src/structured_data_generator/item.yaml | 2 +- functions/src/test_classifier/function.yaml | 2 +- functions/src/test_classifier/item.yaml | 2 +- 
functions/src/text_to_audio_generator/item.yaml | 2 +- functions/src/tf2_serving/function.yaml | 2 +- functions/src/tf2_serving/item.yaml | 2 +- functions/src/transcribe/item.yaml | 2 +- functions/src/translate/item.yaml | 2 +- functions/src/v2_model_server/function.yaml | 2 +- functions/src/v2_model_server/item.yaml | 2 +- functions/src/v2_model_tester/function.yaml | 2 +- functions/src/v2_model_tester/item.yaml | 2 +- modules/src/count_events/item.yaml | 2 +- 46 files changed, 47 insertions(+), 46 deletions(-) diff --git a/functions/src/aggregate/item.yaml b/functions/src/aggregate/item.yaml index 75f7e74c5..43e87a4a2 100644 --- a/functions/src/aggregate/item.yaml +++ b/functions/src/aggregate/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: avia + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/arc_to_parquet/item.yaml b/functions/src/arc_to_parquet/item.yaml index 4bc2634ce..fe2925aef 100644 --- a/functions/src/arc_to_parquet/item.yaml +++ b/functions/src/arc_to_parquet/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: avi + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/auto_trainer/item.yaml b/functions/src/auto_trainer/item.yaml index 7e622db29..ba33f6a08 100755 --- a/functions/src/auto_trainer/item.yaml +++ b/functions/src/auto_trainer/item.yaml @@ -10,7 +10,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yonish + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/azureml_serving/function.yaml b/functions/src/azureml_serving/function.yaml index b2242da1d..978806878 100644 --- a/functions/src/azureml_serving/function.yaml +++ b/functions/src/azureml_serving/function.yaml @@ -5,7 +5,7 @@ metadata: hash: c0f404820b8f0fe92d2d1cfe9dbcc068be1a13bf project: '' labels: - author: yonish + 
author: Iguazio categories: - machine-learning - model-serving diff --git a/functions/src/azureml_serving/item.yaml b/functions/src/azureml_serving/item.yaml index d20e636b0..93fb046b2 100644 --- a/functions/src/azureml_serving/item.yaml +++ b/functions/src/azureml_serving/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yonish + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.1.0 diff --git a/functions/src/azureml_utils/item.yaml b/functions/src/azureml_utils/item.yaml index 342307643..ae33ad5b1 100644 --- a/functions/src/azureml_utils/item.yaml +++ b/functions/src/azureml_utils/item.yaml @@ -10,7 +10,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yonish + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/batch_inference/item.yaml b/functions/src/batch_inference/item.yaml index 16a56cfe7..65b61431e 100644 --- a/functions/src/batch_inference/item.yaml +++ b/functions/src/batch_inference/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: guyl + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/batch_inference_v2/item.yaml b/functions/src/batch_inference_v2/item.yaml index 775579b9e..8b8f01df0 100644 --- a/functions/src/batch_inference_v2/item.yaml +++ b/functions/src/batch_inference_v2/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-08-07:12-25 hidden: false icon: '' labels: - author: eyald + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0-rc51 diff --git a/functions/src/describe/item.yaml b/functions/src/describe/item.yaml index 2c41a025f..da26f1501 100644 --- a/functions/src/describe/item.yaml +++ b/functions/src/describe/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: Davids + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 
1.7.0 diff --git a/functions/src/describe_spark/item.yaml b/functions/src/describe_spark/item.yaml index 6c4ad32d9..58e267d4a 100644 --- a/functions/src/describe_spark/item.yaml +++ b/functions/src/describe_spark/item.yaml @@ -7,7 +7,8 @@ example: describe_spark.ipynb generationDate: 2022-08-28:17-25 hidden: false icon: '' -labels: {} +labels: + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.1.0 diff --git a/functions/src/feature_selection/item.yaml b/functions/src/feature_selection/item.yaml index 5356024df..4f9a3a5dd 100644 --- a/functions/src/feature_selection/item.yaml +++ b/functions/src/feature_selection/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: orz + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.8.0-rc40 diff --git a/functions/src/gen_class_data/item.yaml b/functions/src/gen_class_data/item.yaml index a6dd94b61..30f5cd21c 100644 --- a/functions/src/gen_class_data/item.yaml +++ b/functions/src/gen_class_data/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: Daniel + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/github_utils/function.yaml b/functions/src/github_utils/function.yaml index fe60cff7a..2d5d93aab 100644 --- a/functions/src/github_utils/function.yaml +++ b/functions/src/github_utils/function.yaml @@ -5,7 +5,7 @@ metadata: hash: d8e639af306794ce6f59eb246f0b845c016c9da4 project: '' labels: - author: yaronh + author: Iguazio categories: - utils spec: diff --git a/functions/src/github_utils/item.yaml b/functions/src/github_utils/item.yaml index c00bf86b2..9c06d84a7 100644 --- a/functions/src/github_utils/item.yaml +++ b/functions/src/github_utils/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yaronh + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.1.0 diff --git 
a/functions/src/hugging_face_serving/item.yaml b/functions/src/hugging_face_serving/item.yaml index 48b063e49..edad986be 100644 --- a/functions/src/hugging_face_serving/item.yaml +++ b/functions/src/hugging_face_serving/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-09-05:17-00 hidden: false icon: '' labels: - author: yonish + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.1.0 diff --git a/functions/src/load_dataset/function.yaml b/functions/src/load_dataset/function.yaml index 046bb5cc4..91775a802 100644 --- a/functions/src/load_dataset/function.yaml +++ b/functions/src/load_dataset/function.yaml @@ -5,7 +5,7 @@ metadata: hash: d05aa41d618533335eeaeab38aa434a14e3e3980 project: '' labels: - author: yjb + author: Iguazio framework: sklearn categories: - data-preparation diff --git a/functions/src/load_dataset/item.yaml b/functions/src/load_dataset/item.yaml index d9fcf8d61..fb6f69c40 100644 --- a/functions/src/load_dataset/item.yaml +++ b/functions/src/load_dataset/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yjb + author: Iguazio framework: sklearn maintainers: [] marketplaceType: '' diff --git a/functions/src/mlflow_utils/item.yaml b/functions/src/mlflow_utils/item.yaml index 79304eb38..176a9dd95 100644 --- a/functions/src/mlflow_utils/item.yaml +++ b/functions/src/mlflow_utils/item.yaml @@ -9,7 +9,7 @@ generationDate: 2024-05-23:12-00 hidden: false icon: '' labels: - author: zeevr + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.8.0 diff --git a/functions/src/model_server/item.yaml b/functions/src/model_server/item.yaml index c85cf163d..65c6f09e7 100644 --- a/functions/src/model_server/item.yaml +++ b/functions/src/model_server/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yaronh + author: Iguazio framework: sklearn maintainers: [] marketplaceType: '' diff --git a/functions/src/model_server_tester/function.yaml 
b/functions/src/model_server_tester/function.yaml index eda10459e..45934c444 100644 --- a/functions/src/model_server_tester/function.yaml +++ b/functions/src/model_server_tester/function.yaml @@ -5,7 +5,7 @@ metadata: hash: 3b203a2799e44992539eafd32a4b8979bbcc8001 project: '' labels: - author: yaronh + author: Iguazio categories: - monitoring - model-serving diff --git a/functions/src/model_server_tester/item.yaml b/functions/src/model_server_tester/item.yaml index 3e43a9297..b18e0082c 100644 --- a/functions/src/model_server_tester/item.yaml +++ b/functions/src/model_server_tester/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yaronh + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.1.0 diff --git a/functions/src/noise_reduction/item.yaml b/functions/src/noise_reduction/item.yaml index f748d5587..d8f2cddd4 100644 --- a/functions/src/noise_reduction/item.yaml +++ b/functions/src/noise_reduction/item.yaml @@ -9,7 +9,7 @@ generationDate: 2024-03-04:17-30 hidden: false icon: '' labels: - author: yonatans + author: Iguazio maintainers: [] mlrunVersion: 1.7.0 name: noise-reduction diff --git a/functions/src/onnx_utils/item.yaml b/functions/src/onnx_utils/item.yaml index 02134f32d..81ad593d5 100644 --- a/functions/src/onnx_utils/item.yaml +++ b/functions/src/onnx_utils/item.yaml @@ -10,7 +10,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: guyl + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.2 diff --git a/functions/src/open_archive/item.yaml b/functions/src/open_archive/item.yaml index 0a2f4516c..c40a62e4a 100644 --- a/functions/src/open_archive/item.yaml +++ b/functions/src/open_archive/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yaronh + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.8.0-rc50 diff --git a/functions/src/pii_recognizer/item.yaml 
b/functions/src/pii_recognizer/item.yaml index 8f3185b4c..dcd71c85c 100644 --- a/functions/src/pii_recognizer/item.yaml +++ b/functions/src/pii_recognizer/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-08-15:10-24 hidden: false icon: '' labels: - author: pgw + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/pyannote_audio/item.yaml b/functions/src/pyannote_audio/item.yaml index b6dbccddb..79a5a0f1b 100644 --- a/functions/src/pyannote_audio/item.yaml +++ b/functions/src/pyannote_audio/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-12-03:14-30 hidden: false icon: '' labels: - author: guyl + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/question_answering/item.yaml b/functions/src/question_answering/item.yaml index 741bab80c..b307a9877 100755 --- a/functions/src/question_answering/item.yaml +++ b/functions/src/question_answering/item.yaml @@ -8,7 +8,7 @@ generationDate: 2023-08-07:11-30 hidden: false icon: '' labels: - author: yonish + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/send_email/function.yaml b/functions/src/send_email/function.yaml index e895cddc9..1722fb586 100644 --- a/functions/src/send_email/function.yaml +++ b/functions/src/send_email/function.yaml @@ -5,7 +5,7 @@ metadata: hash: 5c4528084ea98992b77f65e29359bbcb4a0df8ab project: '' labels: - author: saarc + author: Iguazio categories: - utils spec: diff --git a/functions/src/send_email/item.yaml b/functions/src/send_email/item.yaml index 4c42cb73b..6caf1ab50 100644 --- a/functions/src/send_email/item.yaml +++ b/functions/src/send_email/item.yaml @@ -8,7 +8,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: saarc + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.4.1 diff --git a/functions/src/silero_vad/item.yaml b/functions/src/silero_vad/item.yaml index 49adfcd9f..7a1aeaee2 100644 --- 
a/functions/src/silero_vad/item.yaml +++ b/functions/src/silero_vad/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-12-03:14-30 hidden: false icon: '' labels: - author: guyl + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/sklearn_classifier/item.yaml b/functions/src/sklearn_classifier/item.yaml index 1b41e630a..b9726fb79 100644 --- a/functions/src/sklearn_classifier/item.yaml +++ b/functions/src/sklearn_classifier/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: true icon: '' labels: - author: yjb + author: Iguazio framework: sklearn maintainers: [] marketplaceType: '' diff --git a/functions/src/sklearn_classifier_dask/function.yaml b/functions/src/sklearn_classifier_dask/function.yaml index 98be06b8c..46f733886 100644 --- a/functions/src/sklearn_classifier_dask/function.yaml +++ b/functions/src/sklearn_classifier_dask/function.yaml @@ -5,7 +5,7 @@ metadata: hash: e542038fbb84f790b7144b529665f36d70d80906 project: '' labels: - author: yjb + author: Iguazio framework: sklearn categories: - machine-learning diff --git a/functions/src/sklearn_classifier_dask/item.yaml b/functions/src/sklearn_classifier_dask/item.yaml index 35e89b2dd..3264ec681 100644 --- a/functions/src/sklearn_classifier_dask/item.yaml +++ b/functions/src/sklearn_classifier_dask/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: true icon: '' labels: - author: yjb + author: Iguazio framework: sklearn maintainers: [] marketplaceType: '' diff --git a/functions/src/structured_data_generator/item.yaml b/functions/src/structured_data_generator/item.yaml index 6e01aefb9..f268f05e6 100755 --- a/functions/src/structured_data_generator/item.yaml +++ b/functions/src/structured_data_generator/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-12-14:10-50 hidden: false icon: '' labels: - author: zeevr + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.8.0 diff --git a/functions/src/test_classifier/function.yaml 
b/functions/src/test_classifier/function.yaml index d0e1b6067..f35446b51 100644 --- a/functions/src/test_classifier/function.yaml +++ b/functions/src/test_classifier/function.yaml @@ -5,7 +5,7 @@ metadata: hash: b4d447a2328975e90a0dbc7a28f82009924cc157 project: '' labels: - author: yjb + author: Iguazio framework: sklearn categories: - machine-learning diff --git a/functions/src/test_classifier/item.yaml b/functions/src/test_classifier/item.yaml index e9f4982a9..a38497a73 100644 --- a/functions/src/test_classifier/item.yaml +++ b/functions/src/test_classifier/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: true icon: '' labels: - author: yjb + author: Iguazio framework: sklearn maintainers: [] marketplaceType: '' diff --git a/functions/src/text_to_audio_generator/item.yaml b/functions/src/text_to_audio_generator/item.yaml index ff9ec379f..13beef4b9 100644 --- a/functions/src/text_to_audio_generator/item.yaml +++ b/functions/src/text_to_audio_generator/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-12-03:15-30 hidden: false icon: '' labels: - author: yonatans + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.1 diff --git a/functions/src/tf2_serving/function.yaml b/functions/src/tf2_serving/function.yaml index c755263ae..17cf2fbb9 100644 --- a/functions/src/tf2_serving/function.yaml +++ b/functions/src/tf2_serving/function.yaml @@ -4,7 +4,7 @@ metadata: hash: 134293b94996e74275d90546f8d4ef96198af679 project: '' labels: - author: yaronh + author: Iguazio categories: - model-serving - machine-learning diff --git a/functions/src/tf2_serving/item.yaml b/functions/src/tf2_serving/item.yaml index 88dac8478..d7c793364 100644 --- a/functions/src/tf2_serving/item.yaml +++ b/functions/src/tf2_serving/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yaronh + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.1.0 diff --git a/functions/src/transcribe/item.yaml 
b/functions/src/transcribe/item.yaml index 6deaf710a..0bc9e5d0f 100644 --- a/functions/src/transcribe/item.yaml +++ b/functions/src/transcribe/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-07-13:11-20 hidden: false icon: '' labels: - author: yonatans + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/translate/item.yaml b/functions/src/translate/item.yaml index 839d1efaa..eb0e821e4 100644 --- a/functions/src/translate/item.yaml +++ b/functions/src/translate/item.yaml @@ -9,7 +9,7 @@ generationDate: 2023-12-05:17-20 hidden: false icon: '' labels: - author: guyl + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.7.0 diff --git a/functions/src/v2_model_server/function.yaml b/functions/src/v2_model_server/function.yaml index 45d261b6a..5ecfec9ba 100644 --- a/functions/src/v2_model_server/function.yaml +++ b/functions/src/v2_model_server/function.yaml @@ -5,7 +5,7 @@ metadata: hash: ad85919d3b9cf2acae43a3434ba56e01b005755e project: '' labels: - author: yaronh + author: Iguazio framework: sklearn categories: - model-serving diff --git a/functions/src/v2_model_server/item.yaml b/functions/src/v2_model_server/item.yaml index 7bde91a64..4beda6243 100644 --- a/functions/src/v2_model_server/item.yaml +++ b/functions/src/v2_model_server/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yaronh + author: Iguazio framework: sklearn maintainers: [] marketplaceType: '' diff --git a/functions/src/v2_model_tester/function.yaml b/functions/src/v2_model_tester/function.yaml index 518bd1492..c9562b097 100644 --- a/functions/src/v2_model_tester/function.yaml +++ b/functions/src/v2_model_tester/function.yaml @@ -5,7 +5,7 @@ metadata: hash: 72d3f664ff2aa870109e44f52f975bda2ac13682 project: '' labels: - author: yaronh + author: Iguazio categories: - model-testing - machine-learning diff --git a/functions/src/v2_model_tester/item.yaml 
b/functions/src/v2_model_tester/item.yaml index ce1ecef5f..c3412fc5c 100644 --- a/functions/src/v2_model_tester/item.yaml +++ b/functions/src/v2_model_tester/item.yaml @@ -9,7 +9,7 @@ generationDate: 2022-08-28:17-25 hidden: false icon: '' labels: - author: yaronh + author: Iguazio maintainers: [] marketplaceType: '' mlrunVersion: 1.1.0 diff --git a/modules/src/count_events/item.yaml b/modules/src/count_events/item.yaml index e0eb09069..e5d796b62 100644 --- a/modules/src/count_events/item.yaml +++ b/modules/src/count_events/item.yaml @@ -6,7 +6,7 @@ example: count_events.ipynb generationDate: 2025-09-16:12-25 hidden: false labels: - author: iguazio + author: Iguazio mlrunVersion: 1.10.0-rc27 name: count_events spec: From bbcf638477567e762ab258b0127efa36135c834b Mon Sep 17 00:00:00 2001 From: Daniel Perez <100069700+danielperezz@users.noreply.github.com> Date: Sun, 5 Oct 2025 12:45:51 +0300 Subject: [PATCH 02/17] Organize CLI directory + new CLI for generating item.yaml files (#906) * create a CLI for generating item.yaml and organize the CLI directory * modify comments to module * PR fixes * Update cli/common/generate_item_yaml.py Co-authored-by: Eyal Danieli --------- Co-authored-by: Eyal Danieli --- cli/README.md | 66 +++++++++++++++++++++++ cli/cli.py | 7 +-- cli/common/generate_item_yaml.py | 55 +++++++++++++++++++ cli/common/item_yaml.py | 54 ------------------- cli/functions/new_function_item.py | 67 ------------------------ cli/utils/function_item_template.yaml.j2 | 22 ++++++++ cli/utils/item_template.yaml | 21 -------- cli/utils/module_item_template.yaml.j2 | 16 ++++++ requirements.txt | 1 + 9 files changed, 162 insertions(+), 147 deletions(-) create mode 100644 cli/README.md create mode 100644 cli/common/generate_item_yaml.py delete mode 100644 cli/common/item_yaml.py delete mode 100644 cli/functions/new_function_item.py create mode 100644 cli/utils/function_item_template.yaml.j2 delete mode 100644 cli/utils/item_template.yaml create mode 100644 
cli/utils/module_item_template.yaml.j2 diff --git a/cli/README.md b/cli/README.md new file mode 100644 index 000000000..4a3cd3bfc --- /dev/null +++ b/cli/README.md @@ -0,0 +1,66 @@ +## Available Commands +(Explore more advanced options in the code, this is a basic usage demonstration) + +### generate-item-yaml +Generate an `item.yaml` file (basic draft) in the appropriate directory from a Jinja2 template + +Usage: + `python -m cli.cli generate-item-yaml TYPE NAME` + +Example: + `python -m cli.cli generate-item-yaml function aggregate` + +--- + +### item-to-function +Creates a `function.yaml` file based on a provided `item.yaml` file. + +Usage: + `python -m cli.cli item-to-function --item-path PATH` + +Example: + `python -m cli.cli item-to-function --item-path functions/src/aggregate` + +--- + +### function-to-item +Creates an `item.yaml` file based on a provided `function.yaml` file. + +Usage: + `python -m cli.cli function-to-item --path PATH` + +Example: + `python -m cli.cli function-to-item --path functions/src/aggregate` + +--- + +### run-tests +Run assets test suite. + +Usage: + `python -m cli.cli run-tests -r PATH -s TYPE -fn NAME` + +Example: + `python -m cli.cli run-tests -r functions/src/aggregate -s py -fn aggregate` + +--- + +### build-marketplace +Build and push (create a PR) the updated marketplace/ directory (e.g., marketplace/functions) + +Usage: + `python -m cli.cli build-marketplace -s SOURCE-DIR -sn TYPE -m MARKETPLACE-DIR -c CHANNEL -v -f` + +Example: + `python -m cli.cli build-marketplace -s ./functions/src -sn functions -m marketplace -c master -v -f` + +--- + +### update-readme +Regenerate the `README.md` files in each of the asset directories (functions/modules). 
+ +Usage: + `python -m cli.cli update-readme --asset TYPE` + +Example: + `python -m cli.cli update-readme --asset functions --asset modules` \ No newline at end of file diff --git a/cli/cli.py b/cli/cli.py index 8fee9891a..e8e6922fe 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -17,22 +17,19 @@ from cli.functions.function_to_item import function_to_item_cli from cli.functions.item_to_function import item_to_function_cli from cli.marketplace.build import build_marketplace_cli -from cli.functions.new_function_item import new_item as new_function_item from cli.common.test_suite import test_suite -from cli.common.item_yaml import update_functions_yaml from cli.common.update_readme import update_readme +from cli.common.generate_item_yaml import generate_item_yaml @click.group() def cli(): pass - -cli.add_command(new_function_item) +cli.add_command(generate_item_yaml, name="generate-item-yaml") cli.add_command(item_to_function_cli, name="item-to-function") cli.add_command(function_to_item_cli, name="function-to-item") cli.add_command(test_suite, name="run-tests") cli.add_command(build_marketplace_cli, name="build-marketplace") -cli.add_command(update_functions_yaml, name="update-functions-yaml") cli.add_command(update_readme, name="update-readme") if __name__ == "__main__": diff --git a/cli/common/generate_item_yaml.py b/cli/common/generate_item_yaml.py new file mode 100644 index 000000000..9ce362c37 --- /dev/null +++ b/cli/common/generate_item_yaml.py @@ -0,0 +1,55 @@ +import sys +from pathlib import Path +from datetime import datetime +import click +from jinja2 import Environment, FileSystemLoader + +TEMPLATES = { + "function": "cli/utils/function_item_template.yaml.j2", + "module": "cli/utils/module_item_template.yaml.j2", +} + + +@click.command() +@click.argument("type", type=click.Choice(list(TEMPLATES.keys()))) +@click.argument("name") +@click.option("--overwrite", is_flag=True, help="Replace existing file instead of raising an error.") +def generate_item_yaml(type: 
str, name: str, overwrite: bool = False): + """ + Generate an item.yaml file from a template. + +type: one of the supported types (currently only `function` or `module`) +name: the function/module name (also used as the directory name) +overwrite: whether to overwrite existing item.yaml file + """ + # Construct the target path + path = Path(f"{type}s/src/{name}").resolve() + output_file = path / "item.yaml" + + if not overwrite and output_file.exists(): + click.echo(f"Error: {output_file} already exists.", err=True) + sys.exit(1) + + if not path.exists(): + click.echo(f"Error: {path} does not exist.", err=True) + sys.exit(1) + + # Render parameters + params = { + "example": f"{name}.ipynb", + "generationDate": datetime.utcnow().strftime("%Y-%m-%d"), + "name": name, + "filename": f"{name}.py", + } + + # Load and render template + env = Environment(loader=FileSystemLoader(".")) + template = env.get_template(TEMPLATES[type]) + rendered = template.render(params) + + output_file.write_text(rendered) + click.echo(f"Created {output_file}") + + +if __name__ == "__main__": + generate_item_yaml() \ No newline at end of file diff --git a/cli/common/item_yaml.py b/cli/common/item_yaml.py deleted file mode 100644 index a14ea48c2..000000000 --- a/cli/common/item_yaml.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import click -from cli.utils.path_iterator import PathIterator -from cli.utils.helpers import is_item_dir -import yaml -import datetime - - -@click.command() -@click.option("-r", "--root-directory", default=".", help="Path to root directory") -@click.option("-v", "--version", help="update version number in function item yaml") -@click.option("-mv", "--mlrun-version", help="update mlrun version in function item.yaml") -@click.option("-p", "--platform-version", help="update platform version in function item.yaml") -@click.option("-d", "--date-time", help="update date-time in function item.yaml") -def update_functions_yaml(root_directory: str, - version: str, - mlrun_version: str, - platform_version: str, - date_time: str): - if not root_directory: - click.echo("-r/--root-directory is required") - exit(1) - - item_iterator = PathIterator(root=root_directory, rule=is_item_dir, as_path=True) - for inner_dir in item_iterator: - item_yaml = "item.yaml" - if (inner_dir / item_yaml).exists(): - path = str(inner_dir)+"/"+item_yaml - stream = open(path, 'r') - data = yaml.load(stream=stream, Loader=yaml.FullLoader) - if version: - data['version'] = version - if mlrun_version: - data['mlrunVersion'] = mlrun_version - if platform_version: - data['platformVersion'] = platform_version - if date_time: - data['generationDate'] = datetime.datetime.now().strftime('%Y-%m-%d:%H-%M') - print(data) - with open(path, 'w') as yaml_file: - yaml_file.write(yaml.dump(data, default_flow_style=False)) diff --git a/cli/functions/new_function_item.py b/cli/functions/new_function_item.py deleted file mode 100644 index 70eb30d55..000000000 --- a/cli/functions/new_function_item.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2019 Iguazio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from datetime import datetime -from pathlib import Path - -import click - - -@click.command() -@click.option( - "-p", "--path", help="Path to directory in which a new item.yaml will be created" -) -@click.option("-o", "--override", is_flag=True, help="Override if already exists") -def new_item(path: str, override: bool): - path = Path(path) / "item.yaml" - - if not path.parent.exists(): - path.parent.mkdir(parents=True) - elif path.exists() and not override: - click.echo( - f"{path / 'item.yaml'} already exists, set [-o, --override] to override" - ) - exit(1) - - with open(path, "w") as f: - f.write( - f""" -apiVersion: v1 -categories: [] # List of category names -description: '' # Short description -doc: '' # Path to README.md if exists -example: '' # Path to examole notebook -generationDate: {str(datetime.utcnow())} -icon: '' # Path to icon file -labels: {{}} # Key values label pairs -maintainers: [] # List of maintainers -mlrunVersion: '' # Function’s MLRun version requirement, should follow python’s versioning schema -name: '' # Function name -platformVersion: '' # Function’s Iguazio version requirement, should follow python’s versioning schema -spec: - filename: '' # Implementation file - handler: '' # Handler function name - image: '' # Base image name - kind: '' # Function kind - requirements: [] # List of Pythonic library requirements - customFields: {{}} # Custom spec fields - env: [] # Spec environment params -url: '' -version: 0.0.1 # Function version, should follow standard semantic versioning schema -""" - ) - - -if __name__ == "__main__": 
- new_item() diff --git a/cli/utils/function_item_template.yaml.j2 b/cli/utils/function_item_template.yaml.j2 new file mode 100644 index 000000000..da35ef819 --- /dev/null +++ b/cli/utils/function_item_template.yaml.j2 @@ -0,0 +1,22 @@ +apiVersion: v1 +categories: [] {# List of category names #} +description: '' {# Short description #} +doc: '' {# Path to README.md if exists #} +example: {{ example|default('') }} {# Path to example notebook #} +generationDate: {{ generationDate|default('') }} {# Automatically generated ISO 8601 date #} +hidden: false {# Hide function from the UI #} +icon: '' {# Path to icon file #} +labels: {# Key-value label pairs #} + author: Iguazio +maintainers: [] {# List of maintainers #} +mlrunVersion: '' {# Function’s MLRun version requirement, should follow python’s versioning schema #} +name: {{ name|default('') }} {# Function name #} +platformVersion: '' {# Function’s Iguazio version requirement, should follow python’s versioning schema #} +spec: + filename: {{ filename|default('') }} {# Implementation file #} + handler: '' {# Handler function name #} + image: mlrun/mlrun {# Base image name #} + kind: '' {# Function kind #} + requirements: [] {# List of Pythonic library requirements #} +url: '' +version: 1.0.0 {# Function version, should follow standard semantic versioning schema #} \ No newline at end of file diff --git a/cli/utils/item_template.yaml b/cli/utils/item_template.yaml deleted file mode 100644 index b1d38d334..000000000 --- a/cli/utils/item_template.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: v1 -categories: [] # List of category names -description: '' # Short description -doc: '' # Path to README.md if exists -example: '' # Path to examole notebook -generationDate: '' # Automatically generated ISO8086 datetime -hidden: false # Hide function from the UI -icon: '' # Path to icon file -labels: {} # Key values label pairs -maintainers: [] # List of maintainers -mlrunVersion: '' # Function’s MLRun version requirement, 
should follow python’s versioning schema -name: '' # Function name -platformVersion: '' # Function’s Iguazio version requirement, should follow python’s versioning schema -spec: - filename: '' # Implementation file - handler: '' # Handler function name - image: '' # Base image name - kind: '' # Function kind - requirements: [] # List of Pythonic library requirements -url: '' # ??? -version: '' # Function version, should follow standard semantic versioning schema \ No newline at end of file diff --git a/cli/utils/module_item_template.yaml.j2 b/cli/utils/module_item_template.yaml.j2 new file mode 100644 index 000000000..539cd6f0a --- /dev/null +++ b/cli/utils/module_item_template.yaml.j2 @@ -0,0 +1,16 @@ +apiVersion: v1 +categories: [] {# List of category names #} +description: '' {# Short description #} +example: {{ example|default('') }} {# Path to example notebook #} +generationDate: {{ generationDate|default('') }} {# Automatically generated ISO 8601 datetime #} +hidden: false {# Hide Module from the UI #} +labels: + author: Iguazio +mlrunVersion: '' {# Module’s MLRun version requirement, should follow python’s versioning schema #} +name: {{ name|default('') }} {# Module name #} +spec: + filename: {{ filename|default('') }} {# Implementation file #} + image: mlrun/mlrun {# Base image name #} + kind: '' {# Module kind #} + requirements: [] {# List of Pythonic library requirements #} +version: 1.0.0 {# Module version, should follow standard semantic versioning schema #} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index e58ca8e98..c393fd552 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ wheel bs4 mlrun>=1.0.0 jinja2~=3.1.2 +click>=8.0 pipenv myst_nb black>=24.3.0 From 73b4423da22ef97b0725cf88fb190578c4555ba6 Mon Sep 17 00:00:00 2001 From: Eyal Danieli Date: Wed, 5 Nov 2025 18:52:55 +0200 Subject: [PATCH 03/17] fill count events notebook (#908) --- modules/src/count_events/count_events.ipynb | 812
+++++++++++++++++++- modules/src/count_events/count_events.py | 13 +- modules/src/count_events/item.yaml | 4 +- modules/src/count_events/requirements.txt | 4 +- 4 files changed, 819 insertions(+), 14 deletions(-) diff --git a/modules/src/count_events/count_events.ipynb b/modules/src/count_events/count_events.ipynb index 54f657bb0..8a3cac849 100644 --- a/modules/src/count_events/count_events.ipynb +++ b/modules/src/count_events/count_events.ipynb @@ -1,35 +1,829 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "2f5aea66-03d3-4ba2-a0cb-3e74e8376ff0", + "metadata": {}, + "source": [ + "# Count Events Demo" + ] + }, + { + "cell_type": "markdown", + "id": "cdadd95e-d65f-4910-b72f-ef545c09c96b", + "metadata": {}, + "source": [ + "## Overview" + ] + }, + { + "cell_type": "markdown", + "id": "c336160a-3eba-40b3-8d02-7849ca74925b", + "metadata": {}, + "source": [ + "This notebook walks through a simple example of how to monitor a real-time serving function and how to add a custom monitoring application from the hub.\n", + "For simplicity, we’ll use the Count Events application, which calculates the number of requests in each time window.\n", + "If you’d like to create your own model monitoring application (which can later be added to the hub), follow these instructions: https://docs.mlrun.org/en/stable/model-monitoring/applications.html\n", + "\n", + "To add a model monitoring application to your project from the hub, you can choose one of two approaches:\n", + "1. **Set it directly** – the application will be deployed as is.\n", + "2.
**Import it as a module** – this lets you test and modify the application code before deploying it.\n" + ] + }, + { + "cell_type": "markdown", + "id": "1bcc90b4-f3c3-46ea-8348-1e7239e4e6e0", + "metadata": {}, + "source": [ + "## Demo" + ] + }, + { + "cell_type": "markdown", + "id": "2761fb6c-2c9d-4e8c-8efd-e01762b3bb22", + "metadata": {}, + "source": [ + "### Create a project" + ] + }, { "cell_type": "code", - "execution_count": null, - "id": "initial_id", + "execution_count": 1, + "id": "e06ac3e1-8afd-45ab-9448-f664a4e54640", "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2025-11-05 15:33:39,611 [warning] Failed resolving version info. Ignoring and using defaults\n", + "> 2025-11-05 15:33:43,049 [warning] Server or client version is unstable. Assuming compatible: {\"client_version\":\"0.0.0+unstable\",\"server_version\":\"1.11.0\"}\n", + "> 2025-11-05 15:33:58,614 [info] Created and saved project: {\"context\":\"./\",\"from_template\":null,\"name\":\"count-events-demo\",\"overwrite\":false,\"save\":true}\n", + "> 2025-11-05 15:33:58,616 [info] Project created successfully: {\"project_name\":\"count-events-demo\",\"stored_in_db\":true}\n" + ] + } + ], + "source": [ + "import mlrun\n", + "project = mlrun.get_or_create_project(\"count-events-demo\",'./')" + ] + }, + { + "cell_type": "markdown", + "id": "cb0c365d-243f-447d-a693-38007d38329a", + "metadata": {}, + "source": [ + "### Generate datastore profiles for model monitoring\n", + "Before you enable model monitoring, you must configure datastore profiles for TSDB and streaming endpoints. 
A datastore profile holds all the information required to address an external data source, including credentials.\n", + "Model monitoring supports Kafka and V3IO as streaming platforms, and TDEngine and V3IO as TSDB platforms.\n", + "\n", + "In this example we will use V3IO for both streaming and TSDB platforms." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "10df799e-0e63-409c-a204-551635c90410", + "metadata": {}, + "outputs": [], + "source": [ + "from mlrun.datastore.datastore_profile import (\n", + " DatastoreProfileV3io\n", + ")\n", + "\n", + "v3io_profile = DatastoreProfileV3io(name=\"v3io_profile\", v3io_access_key=mlrun.mlconf.get_v3io_access_key())\n", + "\n", + "project.register_datastore_profile(v3io_profile)\n", + "project.set_model_monitoring_credentials(stream_profile_name=v3io_profile.name, tsdb_profile_name=v3io_profile.name)" + ] + }, + { + "cell_type": "markdown", + "id": "94af15ae-b250-4583-950d-b14876065b8a", + "metadata": {}, + "source": [ + "### Deploy model monitoring infrastructure" + ] + }, + { + "cell_type": "markdown", + "id": "56b2adf8-dd65-4ee1-bf18-cd97eeb129b8", + "metadata": {}, + "source": [ + "Once you’ve provided the model monitoring credentials, you can enable monitoring capabilities for your project. \n", + "Visit MLRun's [Model Monitoring Architecture](https://docs.mlrun.org/en/stable/model-monitoring/index.html#model-monitoring-des) to read more." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a83f95bc-e6b5-4184-84cd-d3117f394b1c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2025-11-05 15:41:01 (info) Deploying function\n", + "2025-11-05 15:41:01 (info) Building\n", + "2025-11-05 15:41:01 (info) Staging files and preparing base images\n", + "2025-11-05 15:41:01 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n", + "2025-11-05 15:41:02 (info) Building processor image\n", + "2025-11-05 15:42:57 (info) Build complete\n", + "2025-11-05 15:43:07 (info) Function deploy complete\n", + "2025-11-05 15:40:57 (info) Deploying function\n", + "2025-11-05 15:40:57 (info) Building\n", + "2025-11-05 15:40:58 (info) Staging files and preparing base images\n", + "2025-11-05 15:40:58 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n", + "2025-11-05 15:40:58 (info) Building processor image\n", + "2025-11-05 15:42:53 (info) Build complete\n", + "2025-11-05 15:43:12 (info) Function deploy complete\n", + "2025-11-05 15:40:59 (info) Deploying function\n", + "2025-11-05 15:40:59 (info) Building\n", + "2025-11-05 15:40:59 (info) Staging files and preparing base images\n", + "2025-11-05 15:40:59 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n", + "2025-11-05 15:41:00 (info) Building processor image\n", + "2025-11-05 15:42:55 (info) Build complete\n", + "2025-11-05 15:43:03 (info) Function deploy complete\n" + ] + } + ], + "source": [ + "project.enable_model_monitoring(base_period=10, \n", + " deploy_histogram_data_drift_app=False, # built-in monitoring application for structured data \n", + " wait_for_deployment=True)" + ] + }, + { + "cell_type": "markdown", + "id": "e9f4186b-6f8f-479e-a603-d270397dd9ff", + "metadata": {}, + "source": [ + "### Log Models" + ] + }, + { + "cell_type": "markdown", + 
"id": "310fed55-3f62-4af8-800f-4fb2dccfe2fd", + "metadata": { + "tags": [] + }, + "source": [ + "We’ll generate some dummy classification models and log them to the project." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "fafcec2f-75d1-4af0-bbe0-b796367c48be", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import make_classification\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LinearRegression\n", + "import pickle\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6cabd9aa-87f2-4af7-a5c6-ea0417ceb33f", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare a model and generate training set\n", + "\n", + "X,y = make_classification(n_samples=200,n_features=5,random_state=42)\n", + "X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8,test_size=0.2,random_state=42)\n", + "model = LinearRegression()\n", + "model.fit(X_train,y_train)\n", + "X_test = pd.DataFrame(X_test,columns=[f\"column_{i}\" for i in range(5)])\n", + "y_test = pd.DataFrame(y_test,columns=[\"label\"])\n", + "training_set = pd.concat([X_test,y_test],axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3afde46a-9f26-4438-bedb-acad15866b03", + "metadata": {}, + "outputs": [], + "source": [ + "# Log your models\n", + "for i in range(5):\n", + " project.log_model(key=f\"model_{i}\",body=pickle.dumps(model),model_file=f'model.pkl',training_set=training_set,label_column=\"label\")" + ] + }, + { + "cell_type": "markdown", + "id": "49d820b1-9fd7-4184-9005-25d69578c995", + "metadata": {}, + "source": [ + "### Deploy Serving Function" + ] + }, + { + "cell_type": "markdown", + "id": "19fd7570-3f91-45ff-ba2b-4aebce4a95b4", + "metadata": {}, + "source": [ + "We’ll use a basic serving function and enrich it with the logged models.\n", + "\n", + "\n", + "Note that if you want to monitor a serving function along with its 
associated models, you must enable tracking by calling `set_tracking()`. Otherwise, the serving function’s requests won’t be monitored." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cb806c5b-a0a0-4deb-a63d-f2ea72dc3e02", + "metadata": {}, + "outputs": [], + "source": [ + "# Define the serving\n", + "serving = mlrun.new_function('serving-model-v1',kind='serving')\n", + "graph = serving.set_topology(\"router\", engine=\"sync\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "93ee54ec-0c4a-4eb1-8bc3-d065aec64c8f", + "metadata": {}, + "outputs": [], + "source": [ + "# Apply monitoring\n", + "serving.set_tracking()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f162a254-00ce-4c8a-89df-0cf5d25da5b1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5/5 [00:00<00:00, 22052.07it/s]\n" + ] + } + ], + "source": [ + "# Add models to your serving\n", + "models_uri = [model.uri for model in project.list_models(tag=\"latest\")]\n", + "i=0\n", + "from tqdm import tqdm\n", + "for uri in tqdm(models_uri):\n", + " serving.add_model(key=f'model_{i}',model_path=uri,class_name='mlrun.frameworks.sklearn.SKLearnModelServer')\n", + " i+=1" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ff91f360-5c85-4bc7-a3c3-80a31f1ebd3c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2025-11-05 15:55:08,989 [info] Starting remote function deploy\n", + "2025-11-05 15:55:09 (info) Deploying function\n", + "2025-11-05 15:55:09 (info) Building\n", + "2025-11-05 15:55:09 (info) Staging files and preparing base images\n", + "2025-11-05 15:55:09 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n", + "2025-11-05 15:55:09 (info) Building processor image\n", + "2025-11-05 15:56:54 (info) Build complete\n", + "2025-11-05 15:57:06 (info) Function deploy 
complete\n", + "> 2025-11-05 15:57:10,181 [info] Model endpoint creation task completed with state succeeded\n", + "> 2025-11-05 15:57:10,181 [info] Successfully deployed function: {\"external_invocation_urls\":[\"count-events-demo-serving-model-v1.default-tenant.app.vmdev211.lab.iguazeng.com/\"],\"internal_invocation_urls\":[\"nuclio-count-events-demo-serving-model-v1.default-tenant.svc.cluster.local:8080\"]}\n" + ] + } + ], + "source": [ + "# Deploy serving\n", + "serving_function = project.deploy_function(serving)" + ] + }, + { + "cell_type": "markdown", + "id": "1652a010-e086-4c62-9493-1a82bc125ad4", + "metadata": {}, + "source": [ + "### Invoke Serving" + ] + }, + { + "cell_type": "markdown", + "id": "4c937193-27bc-4b6f-bc1d-cf7472045778", + "metadata": {}, + "source": [ + "Let’s generate some dummy data and invoke our serving function." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "66f469db-9f5b-4e3d-bc85-160a9c90bc8f", + "metadata": {}, "outputs": [], "source": [ - "" + "serving = project.get_function(\"serving-model-v1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "50305c3e-bd1b-4240-9c63-9851173af75e", + "metadata": {}, + "outputs": [], + "source": [ + "inputs = [[-0.51,0.051,0.6287761723991921,-0.8751269647375463,-1.0660002219502747], [-0.51,0.051,0.6287761723991921,-0.8751269647375463,-1.0660002219502747], [-0.51,0.051,0.6287761723991921,-0.8751269647375463,-1.0660002219502747], [-0.51,0.051,0.6287761723991921,-0.8751269647375463,-1.0660002219502747], [-0.51,0.051,0.6287761723991921,-0.8751269647375463,-1.0660002219502747]]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9e8372d6-4fa7-4b45-8932-1f690b55048c", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "for i in range(5):\n", + " for j in range(100):\n", + " serving.invoke(f\"/v2/models/model_{i}/infer\", {\"inputs\": inputs})" + ] + }, + { + "cell_type": "markdown", + "id": 
"4eeb44e1-9c1a-430a-b978-f58f1adeaa12", + "metadata": {}, + "source": [ + "# Evaluate App" + ] + }, + { + "cell_type": "markdown", + "id": "936afba8-c06b-4141-a85e-5cbc9d32aa45", + "metadata": {}, + "source": [ + "Before deploying the Count Events application, let’s first test it to make sure it works as expected. We’ll import it as a module, which downloads the module file to your local filesystem, and then run it as a job using the `evaluate` mechanism." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "213425d1-8470-483e-b325-14aaa991c8c5", + "metadata": {}, + "outputs": [], + "source": [ + "# Import count events from the hub\n", + "count_events_app = mlrun.import_module(\"hub://count_events\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "d91450e4-effb-4963-b913-dcd9829e78b9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2025-11-05 15:57:37,746 [info] Changing function name - adding `\"-batch\"` suffix: {\"func_name\":\"countapp-batch\"}\n", + "> 2025-11-05 15:57:37,927 [info] Storing function: {\"db\":\"http://mlrun-api:8080\",\"name\":\"countapp-batch--handler\",\"uid\":\"b7c240fd99ed4c9b940db6a587a53b80\"}\n", + "> 2025-11-05 15:57:38,202 [info] Job is running in the background, pod: countapp-batch--handler-469fm\n", + "> 2025-11-05 15:57:42,390 [info] Counted events for model endpoint window: {\"count\":4,\"end\":\"NaT\",\"model_endpoint_name\":\"model_0\",\"start\":\"NaT\"}\n", + "> 2025-11-05 15:57:42,498 [info] To track results use the CLI: {\"info_cmd\":\"mlrun get run b7c240fd99ed4c9b940db6a587a53b80 -p count-events-demo\",\"logs_cmd\":\"mlrun logs b7c240fd99ed4c9b940db6a587a53b80 -p count-events-demo\"}\n", + "> 2025-11-05 15:57:42,498 [info] Or click for UI: {\"ui_url\":\"https://dashboard.default-tenant.app.vmdev211.lab.iguazeng.com/mlprojects/count-events-demo/jobs/monitor-jobs/countapp-batch--handler/b7c240fd99ed4c9b940db6a587a53b80/overview\"}\n", 
+ "> 2025-11-05 15:57:42,499 [info] Run execution finished: {\"name\":\"countapp-batch--handler\",\"status\":\"completed\"}\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartendstatekindnamelabelsinputsparametersresults
count-events-demo0Nov 05 15:57:412025-11-05 15:57:42.474376+00:00completedruncountapp-batch--handler
v3io_user=iguazio
kind=job
owner=iguazio
mlrun/client_version=0.0.0+unstable
mlrun/client_python_version=3.11.12
host=countapp-batch--handler-469fm
sample_data
endpoints=['model_0']
write_output=False
existing_data_handling=fail_on_overlap
stream_profile=None
model_0-d25a6714a19b4027b9bccfe8adca8ddc_NaT_NaT={'metric_name': 'count', 'metric_value': 4.0}
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2025-11-05 15:57:46,373 [info] Run execution finished: {\"name\":\"countapp-batch--handler\",\"status\":\"completed\"}\n" + ] + } + ], + "source": [ + "# Run the app as a job\n", + "res = count_events_app.CountApp.evaluate(func_path=\"count_events.py\",\n", + " run_local=False,\n", + " sample_data=pd.DataFrame({\"col\": [1, 2, 3, 4]}),\n", + " image=image,\n", + " endpoints=[\"model_0\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "504adb0b-6ccf-421c-98fc-25ed1a8691e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'model_0-d25a6714a19b4027b9bccfe8adca8ddc_NaT_NaT': {'metric_name': 'count',\n", + " 'metric_value': 4.0}}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res.outputs" + ] + }, + { + "cell_type": "markdown", + "id": "3a05a1c9-b62d-470a-9e18-4c3f5ca61b91", + "metadata": {}, + "source": [ + "Now that the application is available on your filesystem, you can register and deploy it just like any other custom application." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "28bc9645-69b2-418d-a5c5-7ba94f64745f", + "metadata": {}, + "outputs": [], + "source": [ + "fn = project.set_model_monitoring_function(\n", + " func=\"count_events.py\",\n", + " application_class=\"CountApp\",\n", + " name=\"CountEventsFromFile\",\n", + " image=image,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f318f85f-76d8-4494-8029-870edf54df6b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2025-11-05 16:09:48,293 [info] Starting remote function deploy\n", + "2025-11-05 16:09:48 (info) Deploying function\n", + "2025-11-05 16:09:48 (info) Building\n", + "2025-11-05 16:09:48 (info) Staging files and preparing base images\n", + "2025-11-05 16:09:48 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n", + "2025-11-05 16:09:48 (info) Building processor image\n", + "2025-11-05 16:11:33 (info) Build complete\n", + "2025-11-05 16:11:41 (info) Function deploy complete\n", + "> 2025-11-05 16:11:49,604 [info] Model endpoint creation task completed with state succeeded\n", + "> 2025-11-05 16:11:49,605 [info] Successfully deployed function: {\"external_invocation_urls\":[],\"internal_invocation_urls\":[\"nuclio-count-events-demo-counteventsfromfile.default-tenant.svc.cluster.local:8080\"]}\n" + ] + }, + { + "data": { + "text/plain": [ + "DeployStatus(state=ready, outputs={'endpoint': 'http://nuclio-count-events-demo-counteventsfromfile.default-tenant.svc.cluster.local:8080', 'name': 'count-events-demo-counteventsfromfile'})" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project.deploy_function(fn)" + ] + }, + { + "cell_type": "markdown", + "id": "d2b527ee-19e6-4f89-9e51-702fa1707986", + "metadata": {}, + "source": [ + "## Set Application from Hub" + ] + }, + { + "cell_type": "markdown", + "id": 
"b8fa2433-535c-498b-a7ee-3d82d474d447", + "metadata": {}, + "source": [ + "As mentioned, you can set the application directly from the hub by providing a valid hub path (`hub://`)." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "61c50ac6-8dac-41a2-bb9c-705ab543e234", + "metadata": {}, + "outputs": [], + "source": [ + "fn = project.set_model_monitoring_function(\n", + " func=\"hub://count_events\",\n", + " application_class=\"CountApp\",\n", + " name=\"CountEvents\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "df313a94-d742-4ff6-8a28-8390322b8074", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2025-11-05 15:57:58,659 [info] Starting remote function deploy\n", + "2025-11-05 15:57:59 (info) Deploying function\n", + "2025-11-05 15:57:59 (info) Building\n", + "2025-11-05 15:57:59 (info) Staging files and preparing base images\n", + "2025-11-05 15:57:59 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n", + "2025-11-05 15:57:59 (info) Building processor image\n", + "2025-11-05 15:59:34 (info) Build complete\n", + "2025-11-05 15:59:42 (info) Function deploy complete\n", + "> 2025-11-05 15:59:49,826 [info] Model endpoint creation task completed with state succeeded\n", + "> 2025-11-05 15:59:49,827 [info] Successfully deployed function: {\"external_invocation_urls\":[],\"internal_invocation_urls\":[\"nuclio-count-events-demo-countevents.default-tenant.svc.cluster.local:8080\"]}\n" + ] + }, + { + "data": { + "text/plain": [ + "DeployStatus(state=ready, outputs={'endpoint': 'http://nuclio-count-events-demo-countevents.default-tenant.svc.cluster.local:8080', 'name': 'count-events-demo-countevents'})" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project.deploy_function(fn)" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + 
"display_name": "mlrun-base-py311", "language": "python", - "name": "python3" + "name": "conda-env-mlrun-base-py311-py" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.11.12" } }, "nbformat": 4, diff --git a/modules/src/count_events/count_events.py b/modules/src/count_events/count_events.py index c2d6444e4..1c6d97621 100644 --- a/modules/src/count_events/count_events.py +++ b/modules/src/count_events/count_events.py @@ -20,9 +20,20 @@ class CountApp(ModelMonitoringApplicationBase): + """ + Model Monitoring Application that counts the number of events in the given time window. + """ def do_tracking( - self, monitoring_context: mm_context.MonitoringApplicationContext + self, + monitoring_context: mm_context.MonitoringApplicationContext ) -> ModelMonitoringApplicationMetric: + """" + he do_tracking method implementation for the CountApp class. + It counts the number of events in the sample data-frame and logs the count. + + :param monitoring_context: The monitoring application context. It includes the current window data as a + pandas data-frame: monitoring_context.sample_df. 
+ """ sample_df = monitoring_context.sample_df monitoring_context.logger.debug("Sample data-frame", sample_df=sample_df) count = len(sample_df) diff --git a/modules/src/count_events/item.yaml b/modules/src/count_events/item.yaml index e5d796b62..049651ddb 100644 --- a/modules/src/count_events/item.yaml +++ b/modules/src/count_events/item.yaml @@ -7,11 +7,11 @@ generationDate: 2025-09-16:12-25 hidden: false labels: author: Iguazio -mlrunVersion: 1.10.0-rc27 +mlrunVersion: 1.10.0-rc41 name: count_events spec: filename: count_events.py image: mlrun/mlrun kind: monitoring_application requirements: -version: 1.0.0 +version: 1.0.0 \ No newline at end of file diff --git a/modules/src/count_events/requirements.txt b/modules/src/count_events/requirements.txt index 89741402a..0c107c276 100644 --- a/modules/src/count_events/requirements.txt +++ b/modules/src/count_events/requirements.txt @@ -1,3 +1,3 @@ -mlrun==1.10.0-rc27 +mlrun==1.10.0-rc41 pandas==2.1.4 -pytest~=8.2 +pytest~=8.2 \ No newline at end of file From 333d4e70c285aed40e393d7e86274cd87d68ce56 Mon Sep 17 00:00:00 2001 From: Eyal Danieli Date: Thu, 6 Nov 2025 11:50:58 +0200 Subject: [PATCH 04/17] avoid noise reduction unit test (#909) --- functions/src/noise_reduction/item.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/functions/src/noise_reduction/item.yaml b/functions/src/noise_reduction/item.yaml index d8f2cddd4..c37b4ab39 100644 --- a/functions/src/noise_reduction/item.yaml +++ b/functions/src/noise_reduction/item.yaml @@ -26,4 +26,5 @@ spec: torchaudio>=2.1.2, ] url: '' -version: 1.1.0 \ No newline at end of file +version: 1.1.0 +test_valid: False \ No newline at end of file From 77e28ba35d535ca552908433350a9522f4094c4a Mon Sep 17 00:00:00 2001 From: Daniel Perez <100069700+danielperezz@users.noreply.github.com> Date: Sun, 9 Nov 2025 11:11:36 +0200 Subject: [PATCH 05/17] Add histogram-data-drift monitoring application module (without example) (#911) * histogram data drift module with 
empty example notebook * post review fixes --- .../assets/feature_stats.csv | 23 ++ .../assets/sample_df_stats.csv | 23 ++ .../histogram_data_drift.ipynb | 31 ++ .../histogram_data_drift.py | 388 ++++++++++++++++++ modules/src/histogram_data_drift/item.yaml | 20 + .../src/histogram_data_drift/requirements.txt | 3 + .../test_histogram_data_drift.py | 279 +++++++++++++ 7 files changed, 767 insertions(+) create mode 100644 modules/src/histogram_data_drift/assets/feature_stats.csv create mode 100644 modules/src/histogram_data_drift/assets/sample_df_stats.csv create mode 100644 modules/src/histogram_data_drift/histogram_data_drift.ipynb create mode 100644 modules/src/histogram_data_drift/histogram_data_drift.py create mode 100644 modules/src/histogram_data_drift/item.yaml create mode 100644 modules/src/histogram_data_drift/requirements.txt create mode 100644 modules/src/histogram_data_drift/test_histogram_data_drift.py diff --git a/modules/src/histogram_data_drift/assets/feature_stats.csv b/modules/src/histogram_data_drift/assets/feature_stats.csv new file mode 100644 index 000000000..de76ff176 --- /dev/null +++ b/modules/src/histogram_data_drift/assets/feature_stats.csv @@ -0,0 +1,23 @@ +,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm +0,0.0,0.0,0.0,0.0 +1,0.02666666666666667,0.006666666666666667,0.02666666666666667,0.22666666666666666 +2,0.03333333333333333,0.02,0.22,0.04666666666666667 +3,0.04666666666666667,0.02666666666666667,0.07333333333333333,0.04666666666666667 +4,0.10666666666666667,0.02,0.013333333333333334,0.006666666666666667 +5,0.06,0.05333333333333334,0.0,0.006666666666666667 +6,0.03333333333333333,0.09333333333333334,0.0,0.0 +7,0.08666666666666667,0.09333333333333334,0.006666666666666667,0.0 +8,0.09333333333333334,0.06666666666666667,0.013333333333333334,0.04666666666666667 +9,0.06666666666666667,0.17333333333333334,0.02,0.02 +10,0.04,0.07333333333333333,0.03333333333333333,0.03333333333333333 
+11,0.06666666666666667,0.12666666666666668,0.08,0.14 +12,0.10666666666666667,0.08,0.09333333333333334,0.08 +13,0.04666666666666667,0.04,0.08,0.02666666666666667 +14,0.07333333333333333,0.02666666666666667,0.11333333333333333,0.013333333333333334 +15,0.02666666666666667,0.06,0.04,0.08 +16,0.013333333333333334,0.013333333333333334,0.08,0.07333333333333333 +17,0.02666666666666667,0.006666666666666667,0.04666666666666667,0.04 +18,0.006666666666666667,0.006666666666666667,0.02666666666666667,0.02 +19,0.03333333333333333,0.006666666666666667,0.013333333333333334,0.05333333333333334 +20,0.006666666666666667,0.006666666666666667,0.02,0.04 +21,0.0,0.0,0.0,0.0 diff --git a/modules/src/histogram_data_drift/assets/sample_df_stats.csv b/modules/src/histogram_data_drift/assets/sample_df_stats.csv new file mode 100644 index 000000000..dc02ef3ba --- /dev/null +++ b/modules/src/histogram_data_drift/assets/sample_df_stats.csv @@ -0,0 +1,23 @@ +,p0,petal_length_cm,petal_width_cm,sepal_length_cm,sepal_width_cm +0,0.0,1.0,1.0,1.0,1.0 +1,0.0,0.0,0.0,0.0,0.0 +2,0.0,0.0,0.0,0.0,0.0 +3,0.0,0.0,0.0,0.0,0.0 +4,0.0,0.0,0.0,0.0,0.0 +5,0.0,0.0,0.0,0.0,0.0 +6,0.0,0.0,0.0,0.0,0.0 +7,0.0,0.0,0.0,0.0,0.0 +8,0.0,0.0,0.0,0.0,0.0 +9,0.0,0.0,0.0,0.0,0.0 +10,0.0,0.0,0.0,0.0,0.0 +11,1.0,0.0,0.0,0.0,0.0 +12,0.0,0.0,0.0,0.0,0.0 +13,0.0,0.0,0.0,0.0,0.0 +14,0.0,0.0,0.0,0.0,0.0 +15,0.0,0.0,0.0,0.0,0.0 +16,0.0,0.0,0.0,0.0,0.0 +17,0.0,0.0,0.0,0.0,0.0 +18,0.0,0.0,0.0,0.0,0.0 +19,0.0,0.0,0.0,0.0,0.0 +20,0.0,0.0,0.0,0.0,0.0 +21,0.0,0.0,0.0,0.0,0.0 diff --git a/modules/src/histogram_data_drift/histogram_data_drift.ipynb b/modules/src/histogram_data_drift/histogram_data_drift.ipynb new file mode 100644 index 000000000..54a15016a --- /dev/null +++ b/modules/src/histogram_data_drift/histogram_data_drift.ipynb @@ -0,0 +1,31 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": "# Histogram Data Drift Demo", + "id": "2517d91b275da01d" + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/modules/src/histogram_data_drift/histogram_data_drift.py b/modules/src/histogram_data_drift/histogram_data_drift.py new file mode 100644 index 000000000..b8cdcf299 --- /dev/null +++ b/modules/src/histogram_data_drift/histogram_data_drift.py @@ -0,0 +1,388 @@ +# Copyright 2024 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass +from typing import Final, Optional, Protocol, Union, cast + +import numpy as np +from pandas import DataFrame, Series + +import mlrun.artifacts +import mlrun.common.model_monitoring.helpers +import mlrun.model_monitoring.applications.context as mm_context +import mlrun.model_monitoring.applications.results as mm_results +import mlrun.model_monitoring.features_drift_table as mm_drift_table +from mlrun.common.schemas.model_monitoring.constants import ( + ResultKindApp, + ResultStatusApp, + StatsKind, +) +from mlrun.model_monitoring.applications import ( + ModelMonitoringApplicationBase, +) +from mlrun.model_monitoring.metrics.histogram_distance import ( + HellingerDistance, + HistogramDistanceMetric, + KullbackLeiblerDivergence, + TotalVarianceDistance, +) + + +class InvalidMetricValueError(ValueError): + pass + + +class InvalidThresholdValueError(ValueError): + pass + + +class ValueClassifier(Protocol): + def value_to_status(self, value: float) -> ResultStatusApp: ... + + +class HistogramDataDriftApplicationConstants: + NAME = "histogram-data-drift" + GENERAL_RESULT_NAME = "general_drift" + + +@dataclass +class DataDriftClassifier: + """ + Classify data drift numeric values into categorical status. + """ + + potential: float = 0.5 + detected: float = 0.7 + + def __post_init__(self) -> None: + """Catch erroneous threshold values""" + if not 0 < self.potential < self.detected < 1: + raise InvalidThresholdValueError( + "The provided thresholds do not comply with the rules" + ) + + def value_to_status(self, value: float) -> ResultStatusApp: + """ + Translate the numeric value into status category. + + :param value: The numeric value of the data drift metric, between 0 and 1. + :returns: `ResultStatusApp` according to the classification. + """ + if value > 1 or value < 0: + raise InvalidMetricValueError( + f"{value = } is invalid, must be in the range [0, 1]." 
+ ) + if value >= self.detected: + return ResultStatusApp.detected + if value >= self.potential: + return ResultStatusApp.potential_detection + return ResultStatusApp.no_detection + + +class HistogramDataDriftApplication(ModelMonitoringApplicationBase): + """ + MLRun's default data drift application for model monitoring. + + The application expects tabular numerical data, and calculates three metrics over the shared features' histograms. + The metrics are calculated on features that have reference data from the training dataset. When there is no + reference data (`feature_stats`), this application sends a warning log and does nothing. + The three metrics are: + + * Hellinger distance. + * Total variance distance. + * Kullback-Leibler divergence. + + Each metric is calculated over all the features individually and the mean is taken as the metric value. + The average of Hellinger and total variance distance is taken as the result. + + The application can log two artifacts (disabled by default due to performance issues): + + * JSON with the general drift value per feature. + * Plotly table with the various metrics and histograms per feature. + + If you want to change the application defaults, such as the classifier or which artifacts to produce, you + can either modify the downloaded source code file directly, or inherit from this class (in the same file), then + deploy it as any other model monitoring application. + Please make sure to keep the default application name. This ensures that the full functionality of the application, + including the statistics view in the UI, is available. 
+ """ + + NAME: Final[str] = HistogramDataDriftApplicationConstants.NAME + + _REQUIRED_METRICS = {HellingerDistance, TotalVarianceDistance} + _STATS_TYPES: tuple[StatsKind, StatsKind] = ( + StatsKind.CURRENT_STATS, + StatsKind.DRIFT_MEASURES, + ) + + metrics: list[type[HistogramDistanceMetric]] = [ + HellingerDistance, + KullbackLeiblerDivergence, + TotalVarianceDistance, + ] + + def __init__( + self, + value_classifier: Optional[ValueClassifier] = None, + produce_json_artifact: bool = False, + produce_plotly_artifact: bool = False, + ) -> None: + """ + :param value_classifier: Classifier object that adheres to the :py:class:`~ValueClassifier` protocol. + If not provided, the default :py:class:`~DataDriftClassifier` is used. + :param produce_json_artifact: Whether to produce the JSON artifact or not, ``False`` by default. + :param produce_plotly_artifact: Whether to produce the Plotly artifact or not, ``False`` by default. + """ + self._value_classifier = value_classifier or DataDriftClassifier() + assert self._REQUIRED_METRICS <= set( + self.metrics + ), "TVD and Hellinger distance are required for the general data drift result" + + self._produce_json_artifact = produce_json_artifact + self._produce_plotly_artifact = produce_plotly_artifact + + def _compute_metrics_per_feature( + self, monitoring_context: mm_context.MonitoringApplicationContext + ) -> DataFrame: + """Compute the metrics for the different features and labels""" + metrics_per_feature = DataFrame( + columns=[metric_class.NAME for metric_class in self.metrics] + ) + feature_stats = monitoring_context.dict_to_histogram( + monitoring_context.feature_stats + ) + sample_df_stats = monitoring_context.dict_to_histogram( + monitoring_context.sample_df_stats + ) + for feature_name in feature_stats: + sample_hist = np.asarray(sample_df_stats[feature_name]) + reference_hist = np.asarray(feature_stats[feature_name]) + monitoring_context.logger.info( + "Computing metrics for feature", feature_name=feature_name + 
+ ) + metrics_per_feature.loc[feature_name] = { # pyright: ignore[reportCallIssue,reportArgumentType] + metric.NAME: metric( + distrib_t=sample_hist, distrib_u=reference_hist + ).compute() + for metric in self.metrics + } + monitoring_context.logger.info("Finished computing the metrics") + + return metrics_per_feature + + def _get_general_drift_result( + self, metrics: list[mm_results.ModelMonitoringApplicationMetric] + ) -> mm_results.ModelMonitoringApplicationResult: + """Get the general drift result from the metrics list""" + value = cast( + float, + np.mean( + [ + metric.value + for metric in metrics + if metric.name + in [ + f"{HellingerDistance.NAME}_mean", + f"{TotalVarianceDistance.NAME}_mean", + ] + ] + ), + ) + + status = self._value_classifier.value_to_status(value) + + return mm_results.ModelMonitoringApplicationResult( + name=HistogramDataDriftApplicationConstants.GENERAL_RESULT_NAME, + value=value, + kind=ResultKindApp.data_drift, + status=status, + ) + + @staticmethod + def _get_metrics( + metrics_per_feature: DataFrame, + ) -> list[mm_results.ModelMonitoringApplicationMetric]: + """Average each metric over the features and return it as a `<name>_mean` metric""" + metrics: list[mm_results.ModelMonitoringApplicationMetric] = [] + + metrics_mean = metrics_per_feature.mean().to_dict() + + for name, value in metrics_mean.items(): + metrics.append( + mm_results.ModelMonitoringApplicationMetric( + name=f"{name}_mean", + value=value, + ) + ) + + return metrics + + @staticmethod + def _get_stats( + metrics: list[mm_results.ModelMonitoringApplicationMetric], + metrics_per_feature: DataFrame, + monitoring_context: mm_context.MonitoringApplicationContext, + ) -> list[mm_results._ModelMonitoringApplicationStats]: + """ + Return a list of the statistics. 
+ + :param metrics: the calculated metrics + :param metrics_per_feature: metric calculated per feature + :param monitoring_context: context object for current monitoring application + :returns: list of mm_results._ModelMonitoringApplicationStats for histogram data drift application + """ + stats = [] + for stats_type in HistogramDataDriftApplication._STATS_TYPES: + stats.append( + mm_results._ModelMonitoringApplicationStats( + name=stats_type, + stats=metrics_per_feature.T.to_dict() + | {metric.name: metric.value for metric in metrics} + if stats_type == StatsKind.DRIFT_MEASURES + else monitoring_context.sample_df_stats, + timestamp=monitoring_context.end_infer_time.isoformat( + sep=" ", timespec="microseconds" + ), + ) + ) + return stats + + @staticmethod + def _get_shared_features_sample_stats( + monitoring_context: mm_context.MonitoringApplicationContext, + ) -> mlrun.common.model_monitoring.helpers.FeatureStats: + """ + Filter out features without reference data in `feature_stats`, e.g. `timestamp`. 
+ """ + return mlrun.common.model_monitoring.helpers.FeatureStats( + { + key: monitoring_context.sample_df_stats[key] + for key in monitoring_context.feature_stats + } + ) + + @staticmethod + def _log_json_artifact( + drift_per_feature_values: Series, + monitoring_context: mm_context.MonitoringApplicationContext, + ) -> None: + """Log the drift values as a JSON artifact""" + monitoring_context.logger.debug("Logging drift value per feature JSON artifact") + monitoring_context.log_artifact( + mlrun.artifacts.Artifact( + body=drift_per_feature_values.to_json(), + format="json", + key="features_drift_results", + ) + ) + monitoring_context.logger.debug("Logged JSON artifact successfully") + + def _log_plotly_table_artifact( + self, + sample_set_statistics: mlrun.common.model_monitoring.helpers.FeatureStats, + inputs_statistics: mlrun.common.model_monitoring.helpers.FeatureStats, + metrics_per_feature: DataFrame, + drift_per_feature_values: Series, + monitoring_context: mm_context.MonitoringApplicationContext, + ) -> None: + """Log the Plotly drift table artifact""" + monitoring_context.logger.debug( + "Feature stats", + sample_set_statistics=sample_set_statistics, + inputs_statistics=inputs_statistics, + ) + + monitoring_context.logger.debug("Computing drift results per feature") + drift_results = { + cast(str, key): (self._value_classifier.value_to_status(value), value) + for key, value in drift_per_feature_values.items() + } + monitoring_context.logger.debug("Producing plotly artifact") + artifact = mm_drift_table.FeaturesDriftTablePlot().produce( + sample_set_statistics=sample_set_statistics, + inputs_statistics=inputs_statistics, + metrics=metrics_per_feature.T.to_dict(), # pyright: ignore[reportArgumentType] + drift_results=drift_results, + ) + monitoring_context.logger.debug("Logging plotly artifact") + monitoring_context.log_artifact(artifact) + monitoring_context.logger.debug("Logged plotly artifact successfully") + + def _log_drift_artifacts( + self, + 
monitoring_context: mm_context.MonitoringApplicationContext, + metrics_per_feature: DataFrame, + ) -> None: + """Log JSON and Plotly drift data per feature artifacts""" + if not self._produce_json_artifact and not self._produce_plotly_artifact: + return + + drift_per_feature_values = metrics_per_feature[ + [HellingerDistance.NAME, TotalVarianceDistance.NAME] + ].mean(axis=1) + + if self._produce_json_artifact: + self._log_json_artifact(drift_per_feature_values, monitoring_context) + + if self._produce_plotly_artifact: + self._log_plotly_table_artifact( + sample_set_statistics=self._get_shared_features_sample_stats( + monitoring_context + ), + inputs_statistics=monitoring_context.feature_stats, + metrics_per_feature=metrics_per_feature, + drift_per_feature_values=drift_per_feature_values, + monitoring_context=monitoring_context, + ) + + def do_tracking( + self, monitoring_context: mm_context.MonitoringApplicationContext + ) -> list[ + Union[ + mm_results.ModelMonitoringApplicationResult, + mm_results.ModelMonitoringApplicationMetric, + mm_results._ModelMonitoringApplicationStats, + ] + ]: + """ + Calculate and return the data drift metrics, averaged over the features. + """ + monitoring_context.logger.debug("Starting to run the application") + if not monitoring_context.feature_stats: + monitoring_context.logger.warning( + "No feature statistics found, skipping the application. \n" + "In order to run the application, training set must be provided when logging the model." 
+ ) + return [] + metrics_per_feature = self._compute_metrics_per_feature( + monitoring_context=monitoring_context + ) + monitoring_context.logger.debug("Saving artifacts") + self._log_drift_artifacts( + monitoring_context=monitoring_context, + metrics_per_feature=metrics_per_feature, + ) + monitoring_context.logger.debug("Computing average per metric") + metrics = self._get_metrics(metrics_per_feature) + result = self._get_general_drift_result(metrics=metrics) + stats = self._get_stats( + metrics=metrics, + monitoring_context=monitoring_context, + metrics_per_feature=metrics_per_feature, + ) + metrics_result_and_stats = metrics + [result] + stats + monitoring_context.logger.debug( + "Finished running the application", results=metrics_result_and_stats + ) + return metrics_result_and_stats diff --git a/modules/src/histogram_data_drift/item.yaml b/modules/src/histogram_data_drift/item.yaml new file mode 100644 index 000000000..e439e1699 --- /dev/null +++ b/modules/src/histogram_data_drift/item.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +categories: +- model-serving +- structured-ML +description: Model-monitoring application for detecting and visualizing data drift +example: histogram_data_drift.ipynb +generationDate: 2025-11-06 +hidden: false +labels: + author: Iguazio +mlrunVersion: 1.10.0-rc41 +name: histogram_data_drift +spec: + filename: histogram_data_drift.py + image: mlrun/mlrun + kind: monitoring_application + requirements: + - plotly~=5.23 + - pandas +version: 1.0.0 \ No newline at end of file diff --git a/modules/src/histogram_data_drift/requirements.txt b/modules/src/histogram_data_drift/requirements.txt new file mode 100644 index 000000000..4c3614d2b --- /dev/null +++ b/modules/src/histogram_data_drift/requirements.txt @@ -0,0 +1,3 @@ +hypothesis[numpy]~=6.103 +plotly~=5.23 +pandas \ No newline at end of file diff --git a/modules/src/histogram_data_drift/test_histogram_data_drift.py b/modules/src/histogram_data_drift/test_histogram_data_drift.py new file mode 
100644 index 000000000..018edaa86 --- /dev/null +++ b/modules/src/histogram_data_drift/test_histogram_data_drift.py @@ -0,0 +1,279 @@ +# Copyright 2024 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from pathlib import Path +from unittest.mock import Mock + +import pandas as pd +import pytest +from hypothesis import given +from hypothesis import strategies as st + +import mlrun.common.model_monitoring.helpers +import mlrun.model_monitoring.applications +import mlrun.model_monitoring.applications.context as mm_context +import mlrun.utils +from mlrun.common.schemas.model_monitoring.constants import ( + ResultKindApp, + ResultStatusApp, +) +from histogram_data_drift import ( + DataDriftClassifier, + HistogramDataDriftApplication, + InvalidMetricValueError, + InvalidThresholdValueError, +) + +assets_folder = Path(__file__).parent / "assets" + + +@pytest.fixture +def project(tmp_path: Path) -> mlrun.MlrunProject: + project = mlrun.get_or_create_project("temp", allow_cross_project=True) + project.artifact_path = str(tmp_path) + return project + + +@pytest.fixture +def application() -> HistogramDataDriftApplication: + app = HistogramDataDriftApplication( + produce_json_artifact=True, produce_plotly_artifact=True + ) + return app + + +@pytest.fixture +def logger() -> mlrun.utils.Logger: + return mlrun.utils.Logger(level=logging.DEBUG, name="test_histogram_data_drift_app") + + +class TestDataDriftClassifier: + @staticmethod + 
@pytest.mark.parametrize( + ("potential", "detected"), [(0.4, 0.2), (0.0, 0.5), (0.7, 1.0), (-1, 2)] + ) + def test_invalid_threshold(potential: float, detected: float) -> None: + with pytest.raises(InvalidThresholdValueError): + DataDriftClassifier(potential=potential, detected=detected) + + @staticmethod + @given( + st.one_of( + st.floats(max_value=0, exclude_max=True), + st.floats(min_value=1, exclude_min=True), + ) + ) + def test_invalid_metric(value: float) -> None: + with pytest.raises(InvalidMetricValueError): + DataDriftClassifier().value_to_status(value) + + @staticmethod + @pytest.fixture + def classifier() -> DataDriftClassifier: + return DataDriftClassifier(potential=0.5, detected=0.7) + + @staticmethod + @pytest.mark.parametrize( + ("value", "expected_status"), + [ + (0, ResultStatusApp.no_detection), + (0.2, ResultStatusApp.no_detection), + (0.5, ResultStatusApp.potential_detection), + (0.6, ResultStatusApp.potential_detection), + (0.71, ResultStatusApp.detected), + (1, ResultStatusApp.detected), + ], + ) + def test_status( + classifier: DataDriftClassifier, value: float, expected_status: ResultStatusApp + ) -> None: + assert ( + classifier.value_to_status(value) == expected_status + ), "The status is different than expected" + + +class TestApplication: + COUNT = 12 # the sample df size + + @classmethod + @pytest.fixture + def sample_df_stats(cls) -> mlrun.common.model_monitoring.helpers.FeatureStats: + return mlrun.common.model_monitoring.helpers.FeatureStats( + { + "timestamp": { + "count": cls.COUNT, + "25%": "2024-03-11 09:31:39.152301+00:00", + "50%": "2024-03-11 09:31:39.152301+00:00", + "75%": "2024-03-11 09:31:39.152301+00:00", + "max": "2024-03-11 09:31:39.152301+00:00", + "mean": "2024-03-11 09:31:39.152301+00:00", + "min": "2024-03-11 09:31:39.152301+00:00", + }, + "ticker": { + "count": cls.COUNT, + "unique": 1, + "top": "AAPL", + "freq": cls.COUNT, + }, + "f1": { + "count": cls.COUNT, + "hist": [[2, 3, 0, 3, 1, 3], [-10, -5, 0, 5, 10, 15, 
20]], + }, + "f2": { + "count": cls.COUNT, + "hist": [[0, 6, 0, 2, 1, 3], [66, 67, 68, 69, 70, 71, 72]], + }, + "l": { + "count": cls.COUNT, + "hist": [ + [10, 0, 0, 0, 0, 2], + [0.0, 0.16, 0.33, 0.5, 0.67, 0.83, 1.0], + ], + }, + } + ) + + @staticmethod + @pytest.fixture + def feature_stats() -> mlrun.common.model_monitoring.helpers.FeatureStats: + return mlrun.common.model_monitoring.helpers.FeatureStats( + { + "f1": { + "count": 100, + "hist": [[0, 0, 0, 30, 70, 0], [-10, -5, 0, 5, 10, 15, 20]], + }, + "f2": { + "count": 100, + "hist": [[0, 45, 5, 15, 35, 0], [66, 67, 68, 69, 70, 71, 72]], + }, + "l": { + "count": 100, + "hist": [ + [30, 0, 0, 0, 0, 70], + [0.0, 0.16, 0.33, 0.5, 0.67, 0.83, 1.0], + ], + }, + } + ) + + @staticmethod + @pytest.fixture + def monitoring_context( + sample_df_stats: mlrun.common.model_monitoring.helpers.FeatureStats, + feature_stats: mlrun.common.model_monitoring.helpers.FeatureStats, + application: HistogramDataDriftApplication, + logger: mlrun.utils.Logger, + project: mlrun.MlrunProject, + ) -> mm_context.MonitoringApplicationContext: + monitoring_context = mm_context.MonitoringApplicationContext( + application_name=application.NAME, + event={}, + artifacts_logger=project, + logger=logger, + project=project, + nuclio_logger=logger, # the wrong type but works here + ) + monitoring_context._sample_df_stats = sample_df_stats + monitoring_context._feature_stats = feature_stats + + return monitoring_context + + @classmethod + def test( + cls, + application: HistogramDataDriftApplication, + monitoring_context: mm_context.MonitoringApplicationContext, + project: mlrun.MlrunProject, + ) -> None: + results = application.do_tracking(monitoring_context) + metrics = [] + assert len(results) == 6, "Expected four results & metrics % stats" + for res in results: + if isinstance( + res, + mlrun.model_monitoring.applications.ModelMonitoringApplicationResult, + ): + assert ( + res.kind == ResultKindApp.data_drift + ), "The kind should be data drift" 
+ assert ( + res.name == "general_drift" + ), "The result name should be general_drift" + assert ( + res.status == ResultStatusApp.potential_detection + ), "Expected potential detection in the general drift" + elif isinstance( + res, + mlrun.model_monitoring.applications.ModelMonitoringApplicationMetric, + ): + metrics.append(res) + assert len(metrics) == 3, "Expected three metrics" + + # Check the artifacts + assert project._artifact_manager.artifact_uris.keys() == { + "features_drift_results", + "drift_table_plot", + }, "The artifacts in the artifact manager are different than expected" + assert {f.name for f in Path(project.artifact_path).glob("*")} == { + "drift_table_plot.html", + "features_drift_results.json", + }, "The artifact files were not found or are different than expected" + + +class TestMetricsPerFeature: + @staticmethod + @pytest.fixture + def monitoring_context( + logger: mlrun.utils.Logger, + ) -> mm_context.MonitoringApplicationContext: + ctx = Mock() + + def dict_to_histogram(df: pd.DataFrame) -> pd.DataFrame: + return df + + ctx.dict_to_histogram = dict_to_histogram + ctx.logger = logger + return ctx + + @staticmethod + @pytest.mark.parametrize( + ("sample_df_stats", "feature_stats"), + [ + pytest.param(pd.DataFrame(), pd.DataFrame(), id="empty-dfs"), + pytest.param( + pd.read_csv(assets_folder / "sample_df_stats.csv", index_col=0), + pd.read_csv(assets_folder / "feature_stats.csv", index_col=0), + id="real-world-csv-dfs", + ), + ], + ) + def test_compute_metrics_per_feature( + application: HistogramDataDriftApplication, + monitoring_context: Mock, + sample_df_stats: pd.DataFrame, + feature_stats: pd.DataFrame, + ) -> None: + monitoring_context.sample_df_stats = sample_df_stats + monitoring_context.feature_stats = feature_stats + + metrics_per_feature = application._compute_metrics_per_feature( + monitoring_context=monitoring_context + ) + assert set(metrics_per_feature.columns) == { + metric.NAME for metric in application.metrics + }, 
"Different metrics than expected" + assert set(metrics_per_feature.index) == set( + feature_stats.columns + ), "The features are different than expected" From 608112c442c8bcaf87626251ac454d17f1ea986f Mon Sep 17 00:00:00 2001 From: iguazio-cicd Date: Sun, 9 Nov 2025 09:12:43 +0000 Subject: [PATCH 06/17] chore(readme): auto-update asset tables [skip ci] --- modules/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/README.md b/modules/README.md index a14dbf5bb..05e7cfefd 100644 --- a/modules/README.md +++ b/modules/README.md @@ -7,4 +7,5 @@ | Name | Description | Kind | Categories | | --- | --- | --- | --- | | [count_events](/home/runner/work/functions/functions/modules/src/count_events) | Count events in each time window | monitoring_application | model-serving | +| [histogram_data_drift](/home/runner/work/functions/functions/modules/src/histogram_data_drift) | Model-monitoring application for detecting and visualizing data drift | monitoring_application | model-serving, structured-ML | From c56ef485ff021f658c7b6b2384bd6d5ff9f2246e Mon Sep 17 00:00:00 2001 From: Daniel Perez <100069700+danielperezz@users.noreply.github.com> Date: Sun, 9 Nov 2025 19:54:20 +0200 Subject: [PATCH 07/17] Fill histogram-data-drift example notebook (#912) * fill data-drift nb * post review fixes --- .../histogram_data_drift.ipynb | 292 +++++++++++++++++- 1 file changed, 285 insertions(+), 7 deletions(-) diff --git a/modules/src/histogram_data_drift/histogram_data_drift.ipynb b/modules/src/histogram_data_drift/histogram_data_drift.ipynb index 54a15016a..eceb28ca3 100644 --- a/modules/src/histogram_data_drift/histogram_data_drift.ipynb +++ b/modules/src/histogram_data_drift/histogram_data_drift.ipynb @@ -1,29 +1,307 @@ { "cells": [ { + "cell_type": "markdown", + "id": "283b6000-4acd-4eb3-bf51-25ee79e9e5dc", + "metadata": {}, + "source": [ + "# Histogram Data Drift Demo\n", + "The Histogram Data Drift monitoring app is MLRun’s default data drift application for model 
monitoring. It’s considered a built-in app within the model monitoring flow and is deployed by default when model monitoring is enabled for a project. For more information, see the [MLRun documentation](https://docs.mlrun.org/en/latest/model-monitoring/index.html#model-monitoring-applications).\n", + "\n", + "This notebook walks through a simple example of using this app from the hub to monitor data drift between a baseline dataset and a new dataset, using the `evaluate()` method." + ] + }, + { + "cell_type": "markdown", + "id": "da432405-e8bb-400c-b1e0-45e31b0571f1", + "metadata": {}, + "source": [ + "## Set up a project and prepare the data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "62fcc7a4-4df5-4f2e-bd97-6aa831bbf958", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun\n", + "project = mlrun.get_or_create_project(\"histogram-data-drift-demo\",'./histogram-data-drift-demo')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d7ec1628-0303-4bbb-ba34-5cd96eaef304", + "metadata": {}, + "outputs": [], + "source": [ + "sample_data = mlrun.get_sample_path(\"data/batch-predict/training_set.parquet\")\n", + "reference_data = mlrun.get_sample_path(\"data/batch-predict/prediction_set.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "072f1411-33a2-444e-88bf-76d9394d7877", + "metadata": {}, + "source": [ + "## Get the module from the hub and edit its defaults" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5c04dec9-ea6e-410e-a36d-42a71a223caa", + "metadata": {}, + "outputs": [], + "source": [ + "hub_mod = mlrun.get_hub_module(\"hub://histogram_data_drift\", download_files=True)\n", + "src_file_path = hub_mod.get_module_file_path()" + ] + }, + { + "cell_type": "markdown", + "id": "ce26e487-bfe5-442c-9d5a-04a8d75407a6", + "metadata": {}, + "source": [ + "Since the histogram data drift application doesn’t produce artifacts by default, we need to modify the class defaults. 
This can be done in one of two ways: either by editing the downloaded source file directly and then evaluating with the standard class, or - as we’ll do now - by adding an inheriting class to the same file and evaluating using that new class." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "055a31d8-00fd-4f55-b07c-1169db6af919", + "metadata": {}, + "outputs": [], + "source": [ + "# add a declaration of an inheriting class to change the default parameters\n", + "wrapper_code = \"\"\"\n", + "class HistogramDataDriftApplicationWithArtifacts(HistogramDataDriftApplication):\n", + " # The same histogram application but with artifacts\n", + "\n", + " def __init__(self) -> None:\n", + " super().__init__(produce_json_artifact=True, produce_plotly_artifact=True)\n", + "\"\"\"\n", + "with open(src_file_path, \"a\") as f:\n", + " f.write(wrapper_code)" + ] + }, + { + "cell_type": "markdown", + "id": "c17b176b-f838-472f-aaeb-7cedaeb66b56", + "metadata": {}, + "source": [ + "Now we can actually import it as a module, using the `module()` method" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6f57d3c9-9e7e-4fde-b78b-2daf799893e1", + "metadata": {}, + "outputs": [], + "source": [ + "app_module = hub_mod.module()\n", + "hist_app = app_module.HistogramDataDriftApplicationWithArtifacts # or the standard class if you chose to modify its code" + ] + }, + { + "cell_type": "markdown", + "id": "a017bc5a-4935-456b-8648-57c11e11df27", + "metadata": {}, + "source": [ + "And we are ready to call `evaluate()` (notice that the run is linked to the current (active) project)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c20fc990-d0e6-4aab-a576-29cea322bfb5", "metadata": {}, + "outputs": [], + "source": [ + "run_result = hist_app.evaluate(\n", + " func_path=hub_mod.get_module_file_path(),\n", + " sample_data=sample_data,\n", + " reference_data=reference_data,\n", + " run_local=True\n", + ")" + ] + }, + { + "cell_type": "markdown", 
+ "id": "661cdf4d-ee2a-4156-8a71-59f2a1e3b9eb", + "metadata": {}, + "source": [ + "## Examine the results" + ] + }, + { "cell_type": "markdown", - "source": "# Histogram Data Drift Demo", - "id": "2517d91b275da01d" + "id": "e715b6aa-75c0-4352-b98f-bd5a790e1d06", + "metadata": {}, + "source": [ + "First, we'll print nicely the average results:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3688d6a0-6cae-4141-8851-dfd12842c484", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hellinger_mean : 0.34211088243167637\n", + "kld_mean : 2.2839485090490426\n", + "tvd_mean : 0.30536\n", + "general_drift : 0.3237354412158382\n" + ] + } + ], + "source": [ + "for i in range (3):\n", + " metric = run_result.status.results[\"return\"][i]\n", + " print(metric[\"metric_name\"], \": \", metric[\"metric_value\"])\n", + "result = run_result.status.results[\"return\"][3]\n", + "print(result[\"result_name\"], \": \", result[\"result_value\"])" + ] + }, + { + "cell_type": "markdown", + "id": "0422ca13-661b-4574-ad51-d1665be6acdb", + "metadata": {}, + "source": [ + "And we can also examine these metrics per feature, along with other metrics, using the artifacts the app generated for us.\n", + "\n", + "The rightmost column indicates whether the feature has drifted or not. The drift decision rule is the value per-feature mean of the Total Variance Distance (TVD) and Hellinger distance scores. In the histogram-data-drift application, the \"Drift detected\" threshold is 0.7 and the \"Drift suspected\" threshold is 0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d9e7e688-6a71-4b9b-8b99-b2d7f42077e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# The artifact is logged with the run's name\n", + "artifact_key = f\"{run_result.metadata.name}_drift_table_plot\"\n", + "artifact = project.get_artifact(artifact_key)\n", + "artifact.to_dataitem().show()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f8a17a07-6cc4-4bf3-abd8-187042b1973a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Drift value per feature:\n" + ] + }, + { + "data": { + "application/json": { + "feature_0": 0.034754757, + "feature_1": 0.0409220715, + "feature_10": 0.0529929347, + "feature_11": 0.7582778852, + "feature_12": 0.7680105477, + "feature_13": 0.0359189896, + "feature_14": 0.0388433161, + "feature_15": 0.6959895187, + "feature_16": 0.7682657628, + "feature_17": 0.0381781891, + "feature_18": 0.032682812, + "feature_19": 0.7400673333, + "feature_2": 0.7365591239, + "feature_3": 0.0492651761, + "feature_4": 0.0373909913, + "feature_5": 0.0374548709, + "feature_6": 0.7788618285, + "feature_7": 0.7443223594, + "feature_8": 0.0381141123, + "feature_9": 0.0478362439 + }, + "text/plain": [ + "" + ] + }, + "metadata": { + "application/json": { + "expanded": false, + "root": "root" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "print(\"Drift value per feature:\")\n", + "artifact_key = f\"{run_result.metadata.name}_features_drift_results\"\n", + "artifact = project.get_artifact(artifact_key)\n", + "artifact.to_dataitem().show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a8767ca-8a65-4841-9ced-4f36e86bb789", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "mlrun-base-py311", "language": "python", - "name": "python3" + "name": "conda-env-mlrun-base-py311-py" }, "language_info": { "codemirror_mode": { "name": 
"ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.11.12" } }, "nbformat": 4, From 9884e8556f348d5b9ea39ef7942c7667af599598 Mon Sep 17 00:00:00 2001 From: Daniel Perez <100069700+danielperezz@users.noreply.github.com> Date: Tue, 11 Nov 2025 15:26:32 +0200 Subject: [PATCH 08/17] Add evidently demo app monitoring application module (without example) (#913) * sphinx build docs bug fix * add evidently demo app module (empty example notebook) * post review changes --- cli/marketplace/conf.template | 10 +- modules/src/evidently/evidently_iris.ipynb | 37 ++++++ modules/src/evidently/evidently_iris.py | 117 +++++++++++++++++++ modules/src/evidently/item.yaml | 21 ++++ modules/src/evidently/requirements.txt | 3 + modules/src/evidently/test_evidently_iris.py | 72 ++++++++++++ 6 files changed, 258 insertions(+), 2 deletions(-) create mode 100644 modules/src/evidently/evidently_iris.ipynb create mode 100644 modules/src/evidently/evidently_iris.py create mode 100644 modules/src/evidently/item.yaml create mode 100644 modules/src/evidently/requirements.txt create mode 100644 modules/src/evidently/test_evidently_iris.py diff --git a/cli/marketplace/conf.template b/cli/marketplace/conf.template index 93c83c9d3..e26f065aa 100644 --- a/cli/marketplace/conf.template +++ b/cli/marketplace/conf.template @@ -15,8 +15,14 @@ import re import sys import os -sys.path.insert(0, "{{sphinx_docs_target}}") -sys.path.insert(0, os.path.abspath(os.path.join("{{sphinx_docs_target}}", "../functions"))) +import pathlib + +DOCS_DIR = pathlib.Path(__file__).resolve().parent +REPO_ROOT = DOCS_DIR.parent + +# Add both source trees +sys.path.insert(0, str(REPO_ROOT / "functions")) +sys.path.insert(0, str(REPO_ROOT / "modules")) # -- Project information ----------------------------------------------------- diff 
--git a/modules/src/evidently/evidently_iris.ipynb b/modules/src/evidently/evidently_iris.ipynb new file mode 100644 index 000000000..54f657bb0 --- /dev/null +++ b/modules/src/evidently/evidently_iris.ipynb @@ -0,0 +1,37 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/modules/src/evidently/evidently_iris.py b/modules/src/evidently/evidently_iris.py new file mode 100644 index 000000000..e7a9f3ef9 --- /dev/null +++ b/modules/src/evidently/evidently_iris.py @@ -0,0 +1,117 @@ +# Copyright 2025 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional + +import pandas as pd +from sklearn.datasets import load_iris + +import mlrun.model_monitoring.applications.context as mm_context +from mlrun.common.schemas.model_monitoring.constants import ( + ResultKindApp, + ResultStatusApp, +) +from mlrun.feature_store.api import norm_column_name +from mlrun.model_monitoring.applications import ModelMonitoringApplicationResult +from mlrun.model_monitoring.applications.evidently import EvidentlyModelMonitoringApplicationBase + +from evidently.core.report import Report, Snapshot +from evidently.metrics import DatasetMissingValueCount, ValueDrift +from evidently.presets import DataDriftPreset, DataSummaryPreset +from evidently.ui.workspace import ( + STR_UUID, + OrgID, +) + +_PROJECT_NAME = "Iris Monitoring" +_PROJECT_DESCRIPTION = "Test project using iris dataset" + + +class EvidentlyIrisMonitoringApp(EvidentlyModelMonitoringApplicationBase): + """ + This model monitoring application is a simple example of integrating MLRun with Evidently for data monitoring, + which you can adapt to fit your own project needs or use as a reference implementation. 
+ """ + NAME = "Evidently-App-Test" + + def __init__( + self, + evidently_project_id: Optional["STR_UUID"] = None, + evidently_workspace_path: Optional[str] = None, + cloud_workspace: bool = False, + evidently_organization_id: Optional["OrgID"] = None, + ) -> None: + self.org_id = evidently_organization_id + self._init_iris_data() + super().__init__( + evidently_project_id=evidently_project_id, + evidently_workspace_path=evidently_workspace_path, + cloud_workspace=cloud_workspace, + ) + + def _init_iris_data(self) -> None: + iris = load_iris() + self.columns = [norm_column_name(col) for col in iris.feature_names] + self.train_set = pd.DataFrame(iris.data, columns=self.columns) + + def do_tracking( + self, monitoring_context: mm_context.MonitoringApplicationContext + ) -> ModelMonitoringApplicationResult: + monitoring_context.logger.info("Running evidently app") + + sample_df = monitoring_context.sample_df[self.columns] + + data_drift_report_run = self.create_report_run( + sample_df, monitoring_context.end_infer_time + ) + self.evidently_workspace.add_run( + self.evidently_project_id, data_drift_report_run + ) + + self.log_evidently_object( + monitoring_context, data_drift_report_run, "evidently_report" + ) + monitoring_context.logger.info("Logged evidently object") + + return ModelMonitoringApplicationResult( + name="data_drift_test", + value=0.5, + kind=ResultKindApp.data_drift, + status=ResultStatusApp.potential_detection, + ) + + def create_report_run( + self, sample_df: pd.DataFrame, schedule_time: pd.Timestamp + ) -> "Snapshot": + metrics = [ + DataDriftPreset(), + DatasetMissingValueCount(), + DataSummaryPreset(), + ] + metrics.extend( + [ + ValueDrift(column=col_name, method="wasserstein") + for col_name in self.columns + ] + ) + + data_drift_report = Report( + metrics=metrics, + metadata={"timestamp": str(schedule_time)}, + include_tests=True, + ) + + return data_drift_report.run( + current_data=sample_df, reference_data=self.train_set + ) diff --git 
a/modules/src/evidently/item.yaml b/modules/src/evidently/item.yaml new file mode 100644 index 000000000..c6a2abc2c --- /dev/null +++ b/modules/src/evidently/item.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +categories: +- model-serving +- structured-ML +description: Demonstrates Evidently integration in MLRun for data quality and drift monitoring using the Iris dataset +example: evidently_iris.ipynb +generationDate: 2025-11-09 +hidden: false +labels: + author: Iguazio +mlrunVersion: 1.10.0-rc41 +name: evidently_iris +spec: + filename: evidently_iris.py + image: mlrun/mlrun + kind: monitoring_application + requirements: + - scikit-learn~=1.5.2 + - evidently~=0.7.6 + - pandas +version: 1.0.0 \ No newline at end of file diff --git a/modules/src/evidently/requirements.txt b/modules/src/evidently/requirements.txt new file mode 100644 index 000000000..bd4abb36f --- /dev/null +++ b/modules/src/evidently/requirements.txt @@ -0,0 +1,3 @@ +scikit-learn~=1.5.2 +evidently~=0.7.6 +pandas \ No newline at end of file diff --git a/modules/src/evidently/test_evidently_iris.py b/modules/src/evidently/test_evidently_iris.py new file mode 100644 index 000000000..6488768fd --- /dev/null +++ b/modules/src/evidently/test_evidently_iris.py @@ -0,0 +1,72 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import warnings +from contextlib import AbstractContextManager +from contextlib import nullcontext as does_not_raise +from pathlib import Path +from uuid import uuid4 + +import pytest +import semver + +from mlrun.errors import MLRunIncompatibleVersionError +from mlrun.model_monitoring.applications.evidently.base import ( + _check_evidently_version, +) + +from evidently_iris import EvidentlyIrisMonitoringApp + + +@pytest.mark.parametrize( + ("cur", "ref", "expectation"), + [ + ("0.4.11", "0.4.11", does_not_raise()), + ("0.4.12", "0.4.11", does_not_raise()), + ("1.23.0", "1.1.32", does_not_raise()), + ("0.4.11", "0.4.12", pytest.raises(MLRunIncompatibleVersionError)), + ("0.4.11", "0.4.12", pytest.raises(MLRunIncompatibleVersionError)), + ("1.0.3", "0.9.9", pytest.raises(MLRunIncompatibleVersionError)), + ("0.6.0", "0.3.0", pytest.warns(UserWarning)), + pytest.param("0.6.0", "0.3.0", does_not_raise(), marks=pytest.mark.xfail), + ], +) +def test_version_check( + cur: str, + ref: str, + expectation: AbstractContextManager, +) -> None: + with warnings.catch_warnings(): + warnings.simplefilter("error") + with expectation: + _check_evidently_version( + cur=semver.Version.parse(cur), ref=semver.Version.parse(ref) + ) + + +def test_demo_evidently_app(tmpdir: Path) -> None: + """Test that the workspace and the project's dashboards are created""" + evidently_app = EvidentlyIrisMonitoringApp( + evidently_project_id=uuid4(), evidently_workspace_path=str(tmpdir) + ) + run = evidently_app.create_report_run( + sample_df=evidently_app.train_set, schedule_time=None + ) + added_run_uid = evidently_app.evidently_workspace.add_run( + project_id=evidently_app.evidently_project_id, + run=run, + ).id + assert evidently_app.evidently_workspace.list_runs( + project_id=evidently_app.evidently_project_id + ) == [added_run_uid], "Different project runs than expected" From 659b7910f1100a1807d2878a2e1d602d87001e72 Mon Sep 17 00:00:00 2001 From: iguazio-cicd Date: Tue, 11 Nov 2025 13:29:54 
+0000 Subject: [PATCH 09/17] chore(readme): auto-update asset tables [skip ci] --- modules/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/README.md b/modules/README.md index 05e7cfefd..f49c8472f 100644 --- a/modules/README.md +++ b/modules/README.md @@ -7,5 +7,6 @@ | Name | Description | Kind | Categories | | --- | --- | --- | --- | | [count_events](/home/runner/work/functions/functions/modules/src/count_events) | Count events in each time window | monitoring_application | model-serving | +| [evidently](/home/runner/work/functions/functions/modules/src/evidently) | Demonstrates Evidently integration in MLRun for data quality and drift monitoring using the Iris dataset | monitoring_application | model-serving, structured-ML | | [histogram_data_drift](/home/runner/work/functions/functions/modules/src/histogram_data_drift) | Model-monitoring application for detecting and visualizing data drift | monitoring_application | model-serving, structured-ML | From ce1999315d5e1fb1e2e4dc317ed0cec0849fab2e Mon Sep 17 00:00:00 2001 From: Daniel Perez <100069700+danielperezz@users.noreply.github.com> Date: Sun, 16 Nov 2025 12:59:17 +0200 Subject: [PATCH 10/17] [Translate] Require torch>=2.6 for the translate function to work properly (#915) * lock torch valid version * edit the item.yaml and generated function.yaml * update mlrun version --- functions/src/translate/function.yaml | 41 ++++++++++++------------ functions/src/translate/item.yaml | 6 ++-- functions/src/translate/requirements.txt | 2 +- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/functions/src/translate/function.yaml b/functions/src/translate/function.yaml index 9595b77a3..eb1ffd345 100644 --- a/functions/src/translate/function.yaml +++ b/functions/src/translate/function.yaml @@ -1,4 +1,8 @@ +verbose: false spec: + description: Translate text files from one language to another + filename: /Users/Daniel_Perez/PycharmProjects/functions/functions/src/translate/translate.py + 
command: '' entry_points: open_mpi_handler: lineno: 56 @@ -8,24 +12,24 @@ spec: - name: root_worker_inputs type: Dict[str, Any] default: null - name: open_mpi_handler - has_kwargs: false doc: '' + has_kwargs: false has_varargs: false + name: open_mpi_handler decorator: lineno: 68 parameters: - name: handler - name: decorator - has_kwargs: false doc: '' + has_kwargs: false has_varargs: false + name: decorator wrapper: lineno: 73 - name: wrapper - has_kwargs: true doc: '' + has_kwargs: true has_varargs: false + name: wrapper translate: outputs: - doc: 'A tuple of:' @@ -75,8 +79,6 @@ spec: type: bool doc: 'Whether to present logs of a progress bar and errors. Default: True.' default: false - name: translate - has_kwargs: false doc: 'Translate text files using a transformer model from Huggingface''s hub according to the source and target languages @@ -89,27 +91,26 @@ spec: * text_file - The text file path. * translation_file - The translation text file name in the output directory.' + has_kwargs: false has_varargs: false + name: translate + disable_auto_mount: false + image: '' + default_handler: translate build: + functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import operator
import pathlib
from functools import reduce, wraps
from typing import Any, Dict, List, Tuple, Union

import pandas as pd
import transformers
from tqdm import tqdm

# The module-wide logger. It starts as Python's root logger and `open_mpi_handler` rebinds it to the MLRun
# context's logger when an MLRun context is available:
_LOGGER = logging.getLogger()


def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    is_mpi = False
    try:
        import mlrun

        context = mlrun.get_or_create_ctx(name="mlrun")
        is_mpi = context.labels.get("kind", "job") == "mpijob"

        if is_mpi:
            try:
                from mpi4py import MPI

                return context, MPI.COMM_WORLD
            except ModuleNotFoundError as mpi4py_not_found:
                context.logger.error(
                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
                )
                raise mpi4py_not_found
        else:
            return context, None
    except ModuleNotFoundError as module_not_found:
        if is_mpi:
            raise module_not_found
    return None, None


def open_mpi_handler(
    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
):
    """
    Decorator factory that distributes a handler across OpenMPI workers.

    When no MPI communicator is available (or it has a single worker) the handler is returned untouched.
    Otherwise each rank receives its own slice of every input named in `worker_inputs`, the handler is run, and
    rank #0 merges the gathered results.

    :param worker_inputs:      Names of the handler keyword arguments to split between the workers.
    :param root_worker_inputs: Keyword arguments that override the given ones on rank #0 only.

    :returns: The decorator to apply on the handler.
    """
    global _LOGGER

    # Look for MLRun and OpenMPI; when MLRun is around, its logger replaces the global one:
    context, comm = _check_mlrun_and_open_mpi()
    if context:
        _LOGGER = context.logger

    def decorator(handler):
        # Nothing to distribute - hand back the original handler:
        if comm is None or comm.Get_size() == 1:
            return handler

        @wraps(handler)
        def wrapper(**kwargs):
            # The wrapper is keyword-only: inputs are looked up by name in `kwargs`.
            world_size = comm.Get_size()
            rank = comm.Get_rank()
            is_last_rank = rank + 1 >= world_size

            # Replace each distributed input with this rank's chunk of it:
            for input_name in worker_inputs:
                value = kwargs[input_name]
                if value is None:
                    continue
                # A path is expanded to the list of files it points at before splitting:
                if isinstance(value, (str, pathlib.Path)):
                    value = _get_text_files(
                        data_path=pathlib.Path(value).absolute()
                    )
                total = len(value)
                if total < world_size:
                    raise ValueError(
                        f"Cannot split the input '{input_name}' of length {total} to {world_size} workers. "
                        f"Please reduce the amount of workers for this input."
                    )
                # Even chunks for everyone, the last rank also takes the remainder:
                chunk_size = total // world_size
                chunk_start = rank * chunk_size
                chunk_end = total if is_last_rank else (rank + 1) * chunk_size
                context.logger.info(
                    f"Rank #{rank}: Processing input chunk of '{input_name}' "
                    f"from index {chunk_start} to {chunk_end}."
                )
                if isinstance(value, list):
                    value = value[chunk_start:chunk_end]
                elif isinstance(value, pd.DataFrame):
                    value = value.iloc[chunk_start:chunk_end, :]
                kwargs[input_name] = value

            # Root worker only arguments:
            if rank == 0 and root_worker_inputs:
                kwargs.update(root_worker_inputs)

            # Run this worker and send its result to the root rank (rank #0):
            gathered = comm.gather(handler(**kwargs), root=0)
            if rank != 0:
                return None

            # Join the outputs - each one is unpacked as (output directory, dataframe, errors dictionary):
            context.logger.info("Collecting data from workers to root worker.")
            output_directory = gathered[0][0]
            dataframe = pd.concat(
                objs=[worker_dataframe for _, worker_dataframe, _ in gathered], axis=0
            )
            errors_dictionary = {}
            for _, _, worker_errors in gathered:
                errors_dictionary |= worker_errors
            return output_directory, dataframe, errors_dictionary

        return wrapper

    return decorator


@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
def translate(
    data_path: Union[str, List[str], pathlib.Path],
    output_directory: str,
    model_name: str = None,
    source_language: str = None,
    target_language: str = None,
    device: str = None,
    model_kwargs: dict = None,
    batch_size: int = 1,
    translation_kwargs: dict = None,
    verbose: bool = False,
) -> Tuple[str, pd.DataFrame, dict]:
    """
    Translate text files using a transformer model from Huggingface's hub according to the source and target languages
    given (or using the directly provided model name). The end result is a directory of translated text files and a
    dataframe containing the following columns:

    * text_file - The text file name.
    * translation_file - The translation text file name in the output directory.

    :param data_path:          A directory of text files or a single file or a list of files to translate.
    :param output_directory:   Directory where the translated files will be saved.
    :param model_name:         The name of a model to load. If None, the model name is constructed using the source and
                               target languages parameters.
    :param source_language:    The source language code (e.g., 'en' for English).
    :param target_language:    The target language code (e.g., 'en' for English).
    :param model_kwargs:       Keyword arguments to pass regarding the loading of the model in HuggingFace's `pipeline`
                               function.
    :param device:             The device to pass to HuggingFace's `pipeline` function.
    :param batch_size:         The batch size to configure on the pipeline. The files are translated one by one, but
                               the sentences can be batched. A value of 1 leaves the pipeline's batching unset.
    :param translation_kwargs: Additional keyword arguments to pass to the translation pipeline when doing the
                               translation inference. The batch size is configured on the pipeline at creation.
    :param verbose:            Whether to present logs of a progress bar and errors. Default: False.

    :returns: A tuple of:

              * Path to the output directory.
              * A dataframe dataset of the translated file names.
              * A dictionary of errored files that were not translated.
    """
    global _LOGGER

    # Get the input text files to translate. A single path (string or `pathlib.Path`) is expanded to the files it
    # points at; a list is taken as is, with its items normalized to `pathlib.Path` so `.name` / `.stem` are always
    # available below (including inside the error handling):
    if verbose:
        _LOGGER.info("Collecting text files.")
    if isinstance(data_path, (str, pathlib.Path)):
        text_files = _get_text_files(data_path=pathlib.Path(data_path).absolute())
    else:
        text_files = [pathlib.Path(text_file) for text_file in data_path]
    if verbose:
        _LOGGER.info(f"Collected {len(text_files)} text files.")

    # Get the translation pipeline:
    if verbose:
        _LOGGER.info(f"Loading model - using device '{device}'.")
    translation_pipeline, model_name = _get_translation_pipeline(
        model_name=model_name,
        source_language=source_language,
        target_language=target_language,
        device=device,
        model_kwargs=model_kwargs,
        batch_size=batch_size if batch_size != 1 else None,
    )
    if verbose:
        _LOGGER.info(f"Model '{model_name}' was loaded successfully.")

    # Prepare the successes dataframe and errors dictionary to be returned:
    successes = []
    errors = {}

    # Create the output directory:
    output_directory = pathlib.Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    # Prepare the translation keyword arguments:
    translation_kwargs = translation_kwargs or {}

    # Go over the text files and translate:
    for text_file in tqdm(
        text_files, desc="Translating", unit="file", disable=not verbose
    ):
        try:
            # Translate:
            translation = _translate(
                text_file=text_file,
                translation_pipeline=translation_pipeline,
                translation_kwargs=translation_kwargs,
            )
            # Write the translation to file:
            translation_file = _save_to_file(
                translation=translation,
                file_name=text_file.stem,
                output_directory=output_directory,
            )
            # Note as a success in the list:
            successes.append(
                [
                    text_file.name,
                    translation_file.name,
                ]
            )
        except Exception as exception:
            # A failing file must not stop the run - note the exception as error in the dictionary:
            if verbose:
                _LOGGER.warning(f"Error in file: '{text_file.name}'")
            errors[str(text_file.name)] = str(exception)
            continue

    # Construct the translations dataframe:
    columns = [
        "text_file",
        "translation_file",
    ]
    successes = pd.DataFrame(
        successes,
        columns=columns,
    )

    # Print the head of the produced dataframe and return:
    if verbose:
        _LOGGER.info(
            f"Done ({successes.shape[0]}/{len(text_files)})\n"
            f"Translations summary:\n"
            f"{successes.head()}"
        )
    return str(output_directory), successes, errors


def _get_text_files(
    data_path: pathlib.Path,
) -> List[pathlib.Path]:
    # Check if the path is of a directory or a file:
    if data_path.is_dir():
        # Get all files inside the directory:
        text_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        text_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
            f"Given: {str(data_path)} "
        )

    return text_files


def _get_translation_pipeline(
    model_name: str = None,
    source_language: str = None,
    target_language: str = None,
    device: str = None,
    model_kwargs: dict = None,
    batch_size: int = None,
) -> Tuple[transformers.Pipeline, str]:
    """
    Build the HuggingFace translation pipeline.

    :param model_name:      The model to load. When None, a Helsinki-NLP model name is built from the languages.
    :param source_language: The source language code, used only to build the model name.
    :param target_language: The target language code, used only to build the model name.
    :param device:          Passed as is to `transformers.pipeline`.
    :param model_kwargs:    Passed as is to `transformers.pipeline`.
    :param batch_size:      Passed as is to `transformers.pipeline`.

    :returns: A tuple of the pipeline and the model name that was used.

    :raises ValueError: If there is no model name and a language is missing, or if loading failed with an
                        invalid model identifier error while `source_language` was given.
    """
    # An explicit model name always wins; otherwise both languages are needed to construct one:
    if model_name is None:
        if source_language is None or target_language is None:
            raise ValueError(
                "No model name were given and missing source and / or target languages. In order to translate you must "
                "pass a `model_name` or both `source_language` and `target_language`."
            )
        model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"

    # Initialize the translation pipeline:
    pipeline_arguments = {
        "task": "translation",
        "model": model_name,
        "tokenizer": model_name,
        "device": device,
        "model_kwargs": model_kwargs,
        "batch_size": batch_size,
    }
    try:
        translation_pipeline = transformers.pipeline(**pipeline_arguments)
    except OSError as load_exception:
        is_unknown_model = (
            "is not a valid model identifier listed on 'https://huggingface.co/models'"
            in str(load_exception)
        )
        # Any other loading failure is passed on untouched:
        if not (is_unknown_model and source_language):
            raise
        raise ValueError(
            f"The model '{model_name}' is not a valid model identifier. "
            f"The parameters `source_language` and `target_language` are used to construct a Helsinki model for "
            f"text to text generation, but the model created from the given languages does not exist. "
            f"You may check language identifiers at "
            f"https://developers.google.com/admin-sdk/directory/v1/languages, and if the error was not fixed, one "
            f"or more language code might be with 3 letters and needs to be found online. "
            f"Remember, you can always choose a model directly from the Huggingface hub by using the `model_name` "
            f"parameter."
        ) from load_exception

    return translation_pipeline, model_name


def _translate(
    text_file: pathlib.Path,
    translation_pipeline: transformers.Pipeline,
    translation_kwargs: dict,
) -> str:
    """
    Translate a single text file sentence by sentence, keeping its line structure.

    :param text_file:            The text file to translate. It is read as UTF-8.
    :param translation_pipeline: The pipeline to call with the list of sentences.
    :param translation_kwargs:   Keyword arguments to pass to the pipeline call.

    :returns: The translated text.
    """
    # Read the text from file (explicit encoding so the result does not depend on the machine's locale):
    with open(text_file, "r", encoding="utf-8") as fp:
        text = fp.read()

    # Split to paragraphs and each paragraph to sentences:
    paragraphs = [paragraph.split(".") for paragraph in text.split("\n")]

    # Discover the newline indexes to restore the file to its structure post translation. Each index is the position
    # (in the flat sentences list) of the last sentence of a paragraph that is followed by a newline:
    newlines_indexes = set()
    sentences_count = 0
    for paragraph in paragraphs[:-1]:
        sentences_count += len(paragraph)
        newlines_indexes.add(sentences_count - 1)

    # Prepare the batches (each sentence from the paragraphs). Notice we add a dot not only to restore the sentence
    # structure but to ignore empty strings as it will ruin the translation:
    sentences = [f"{line}." for paragraph in paragraphs for line in paragraph]

    # Translate the sentences:
    translations = translation_pipeline(sentences, **translation_kwargs)

    # Restructure the full text from the sentences:
    translated_text = []
    for i, translation in enumerate(translations):
        # Get the translation:
        translated_sentence = translation["translation_text"]
        # Validate if it was an empty sentence before:
        if translated_sentence == ".":
            translated_sentence = ""
        # Check if needed to insert a newline. Membership is used (and not truthiness of the index) because index 0
        # is a valid newline position when the first line holds a single sentence:
        if i in newlines_indexes:
            translated_sentence += "\n"
        # Collect it:
        translated_text.append(translated_sentence)

    return "".join(translated_text)


def _save_to_file(
    translation: str, file_name: str, output_directory: pathlib.Path
) -> pathlib.Path:
    # Prepare the file full path (checking for no duplications):
    translation_file = output_directory / f"{file_name}.txt"
    i = 1
    while translation_file.exists():
        i += 1
        translation_file = output_directory / f"{file_name}_{i}.txt"

    # Make sure all directories are created:
    translation_file.parent.mkdir(exist_ok=True, parents=True)

    # Write to file:
    with open(translation_file, "w") as fp:
        fp.write(translation)

    return translation_file
 + origin_filename: '' + base_image: mlrun/mlrun requirements: - transformers - sentencepiece - - torch + - torch>=2.6 - tqdm code_origin: '' - functionSourceCode: # Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import operator
import pathlib
from functools import reduce, wraps
from typing import Any, Dict, List, Tuple, Union

import pandas as pd
import transformers
from tqdm import tqdm

# Get the global logger:
_LOGGER = logging.getLogger()


def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    is_mpi = False
    try:
        import mlrun

        context = mlrun.get_or_create_ctx(name="mlrun")
        is_mpi = context.labels.get("kind", "job") == "mpijob"

        if is_mpi:
            try:
                from mpi4py import MPI

                return context, MPI.COMM_WORLD
            except ModuleNotFoundError as mpi4py_not_found:
                context.logger.error(
                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
                )
                raise mpi4py_not_found
        else:
            return context, None
    except ModuleNotFoundError as module_not_found:
        if is_mpi:
            raise module_not_found
    return None, None


def open_mpi_handler(
    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
):
    """
    Decorator factory that distributes a handler across OpenMPI workers.

    When no MPI communicator is available (or it has a single worker) the handler is returned untouched.
    Otherwise each rank receives its own slice of every input named in `worker_inputs`, the handler is run, and
    rank #0 merges the gathered results.

    :param worker_inputs:      Names of the handler keyword arguments to split between the workers.
    :param root_worker_inputs: Keyword arguments that override the given ones on rank #0 only.

    :returns: The decorator to apply on the handler.
    """
    global _LOGGER

    # Look for MLRun and OpenMPI; when MLRun is around, its logger replaces the global one:
    context, comm = _check_mlrun_and_open_mpi()
    if context:
        _LOGGER = context.logger

    def decorator(handler):
        # Nothing to distribute - hand back the original handler:
        if comm is None or comm.Get_size() == 1:
            return handler

        @wraps(handler)
        def wrapper(**kwargs):
            # The wrapper is keyword-only: inputs are looked up by name in `kwargs`.
            world_size = comm.Get_size()
            rank = comm.Get_rank()
            is_last_rank = rank + 1 >= world_size

            # Replace each distributed input with this rank's chunk of it:
            for input_name in worker_inputs:
                value = kwargs[input_name]
                if value is None:
                    continue
                # A path is expanded to the list of files it points at before splitting:
                if isinstance(value, (str, pathlib.Path)):
                    value = _get_text_files(
                        data_path=pathlib.Path(value).absolute()
                    )
                total = len(value)
                if total < world_size:
                    raise ValueError(
                        f"Cannot split the input '{input_name}' of length {total} to {world_size} workers. "
                        f"Please reduce the amount of workers for this input."
                    )
                # Even chunks for everyone, the last rank also takes the remainder:
                chunk_size = total // world_size
                chunk_start = rank * chunk_size
                chunk_end = total if is_last_rank else (rank + 1) * chunk_size
                context.logger.info(
                    f"Rank #{rank}: Processing input chunk of '{input_name}' "
                    f"from index {chunk_start} to {chunk_end}."
                )
                if isinstance(value, list):
                    value = value[chunk_start:chunk_end]
                elif isinstance(value, pd.DataFrame):
                    value = value.iloc[chunk_start:chunk_end, :]
                kwargs[input_name] = value

            # Root worker only arguments:
            if rank == 0 and root_worker_inputs:
                kwargs.update(root_worker_inputs)

            # Run this worker and send its result to the root rank (rank #0):
            gathered = comm.gather(handler(**kwargs), root=0)
            if rank != 0:
                return None

            # Join the outputs - each one is unpacked as (output directory, dataframe, errors dictionary):
            context.logger.info("Collecting data from workers to root worker.")
            output_directory = gathered[0][0]
            dataframe = pd.concat(
                objs=[worker_dataframe for _, worker_dataframe, _ in gathered], axis=0
            )
            errors_dictionary = {}
            for _, _, worker_errors in gathered:
                errors_dictionary |= worker_errors
            return output_directory, dataframe, errors_dictionary

        return wrapper

    return decorator


@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
def translate(
    data_path: Union[str, List[str], pathlib.Path],
    output_directory: str,
    model_name: str = None,
    source_language: str = None,
    target_language: str = None,
    device: str = None,
    model_kwargs: dict = None,
    batch_size: int = 1,
    translation_kwargs: dict = None,
    verbose: bool = False,
) -> Tuple[str, pd.DataFrame, dict]:
    """
    Translate text files using a transformer model from Huggingface's hub according to the source and target languages
    given (or using the directly provided model name). The end result is a directory of translated text files and a
    dataframe containing the following columns:

    * text_file - The text file name.
    * translation_file - The translation text file name in the output directory.

    :param data_path:          A directory of text files or a single file or a list of files to translate.
    :param output_directory:   Directory where the translated files will be saved.
    :param model_name:         The name of a model to load. If None, the model name is constructed using the source and
                               target languages parameters.
    :param source_language:    The source language code (e.g., 'en' for English).
    :param target_language:    The target language code (e.g., 'en' for English).
    :param model_kwargs:       Keyword arguments to pass regarding the loading of the model in HuggingFace's `pipeline`
                               function.
    :param device:             The device to pass to HuggingFace's `pipeline` function.
    :param batch_size:         The batch size to configure on the pipeline. The files are translated one by one, but
                               the sentences can be batched. A value of 1 leaves the pipeline's batching unset.
    :param translation_kwargs: Additional keyword arguments to pass to the translation pipeline when doing the
                               translation inference. The batch size is configured on the pipeline at creation.
    :param verbose:            Whether to present logs of a progress bar and errors. Default: False.

    :returns: A tuple of:

              * Path to the output directory.
              * A dataframe dataset of the translated file names.
              * A dictionary of errored files that were not translated.
    """
    global _LOGGER

    # Get the input text files to translate. A single path (string or `pathlib.Path`) is expanded to the files it
    # points at; a list is taken as is, with its items normalized to `pathlib.Path` so `.name` / `.stem` are always
    # available below (including inside the error handling):
    if verbose:
        _LOGGER.info("Collecting text files.")
    if isinstance(data_path, (str, pathlib.Path)):
        text_files = _get_text_files(data_path=pathlib.Path(data_path).absolute())
    else:
        text_files = [pathlib.Path(text_file) for text_file in data_path]
    if verbose:
        _LOGGER.info(f"Collected {len(text_files)} text files.")

    # Get the translation pipeline:
    if verbose:
        _LOGGER.info(f"Loading model - using device '{device}'.")
    translation_pipeline, model_name = _get_translation_pipeline(
        model_name=model_name,
        source_language=source_language,
        target_language=target_language,
        device=device,
        model_kwargs=model_kwargs,
        batch_size=batch_size if batch_size != 1 else None,
    )
    if verbose:
        _LOGGER.info(f"Model '{model_name}' was loaded successfully.")

    # Prepare the successes dataframe and errors dictionary to be returned:
    successes = []
    errors = {}

    # Create the output directory:
    output_directory = pathlib.Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    # Prepare the translation keyword arguments:
    translation_kwargs = translation_kwargs or {}

    # Go over the text files and translate:
    for text_file in tqdm(
        text_files, desc="Translating", unit="file", disable=not verbose
    ):
        try:
            # Translate:
            translation = _translate(
                text_file=text_file,
                translation_pipeline=translation_pipeline,
                translation_kwargs=translation_kwargs,
            )
            # Write the translation to file:
            translation_file = _save_to_file(
                translation=translation,
                file_name=text_file.stem,
                output_directory=output_directory,
            )
            # Note as a success in the list:
            successes.append(
                [
                    text_file.name,
                    translation_file.name,
                ]
            )
        except Exception as exception:
            # A failing file must not stop the run - note the exception as error in the dictionary:
            if verbose:
                _LOGGER.warning(f"Error in file: '{text_file.name}'")
            errors[str(text_file.name)] = str(exception)
            continue

    # Construct the translations dataframe:
    columns = [
        "text_file",
        "translation_file",
    ]
    successes = pd.DataFrame(
        successes,
        columns=columns,
    )

    # Print the head of the produced dataframe and return:
    if verbose:
        _LOGGER.info(
            f"Done ({successes.shape[0]}/{len(text_files)})\n"
            f"Translations summary:\n"
            f"{successes.head()}"
        )
    return str(output_directory), successes, errors


def _get_text_files(
    data_path: pathlib.Path,
) -> List[pathlib.Path]:
    # Check if the path is of a directory or a file:
    if data_path.is_dir():
        # Get all files inside the directory:
        text_files = list(data_path.glob("*.*"))
    elif data_path.is_file():
        text_files = [data_path]
    else:
        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
            f"Given: {str(data_path)} "
        )

    return text_files


def _get_translation_pipeline(
    model_name: str = None,
    source_language: str = None,
    target_language: str = None,
    device: str = None,
    model_kwargs: dict = None,
    batch_size: int = None,
) -> Tuple[transformers.Pipeline, str]:
    """
    Build the HuggingFace translation pipeline.

    :param model_name:      The model to load. When None, a Helsinki-NLP model name is built from the languages.
    :param source_language: The source language code, used only to build the model name.
    :param target_language: The target language code, used only to build the model name.
    :param device:          Passed as is to `transformers.pipeline`.
    :param model_kwargs:    Passed as is to `transformers.pipeline`.
    :param batch_size:      Passed as is to `transformers.pipeline`.

    :returns: A tuple of the pipeline and the model name that was used.

    :raises ValueError: If there is no model name and a language is missing, or if loading failed with an
                        invalid model identifier error while `source_language` was given.
    """
    # An explicit model name always wins; otherwise both languages are needed to construct one:
    if model_name is None:
        if source_language is None or target_language is None:
            raise ValueError(
                "No model name were given and missing source and / or target languages. In order to translate you must "
                "pass a `model_name` or both `source_language` and `target_language`."
            )
        model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"

    # Initialize the translation pipeline:
    pipeline_arguments = {
        "task": "translation",
        "model": model_name,
        "tokenizer": model_name,
        "device": device,
        "model_kwargs": model_kwargs,
        "batch_size": batch_size,
    }
    try:
        translation_pipeline = transformers.pipeline(**pipeline_arguments)
    except OSError as load_exception:
        is_unknown_model = (
            "is not a valid model identifier listed on 'https://huggingface.co/models'"
            in str(load_exception)
        )
        # Any other loading failure is passed on untouched:
        if not (is_unknown_model and source_language):
            raise
        raise ValueError(
            f"The model '{model_name}' is not a valid model identifier. "
            f"The parameters `source_language` and `target_language` are used to construct a Helsinki model for "
            f"text to text generation, but the model created from the given languages does not exist. "
            f"You may check language identifiers at "
            f"https://developers.google.com/admin-sdk/directory/v1/languages, and if the error was not fixed, one "
            f"or more language code might be with 3 letters and needs to be found online. "
            f"Remember, you can always choose a model directly from the Huggingface hub by using the `model_name` "
            f"parameter."
        ) from load_exception

    return translation_pipeline, model_name


def _translate(
    text_file: pathlib.Path,
    translation_pipeline: transformers.Pipeline,
    translation_kwargs: dict,
) -> str:
    """
    Translate a single text file sentence by sentence, keeping its line structure.

    :param text_file:            The text file to translate. It is read as UTF-8.
    :param translation_pipeline: The pipeline to call with the list of sentences.
    :param translation_kwargs:   Keyword arguments to pass to the pipeline call.

    :returns: The translated text.
    """
    # Read the text from file (explicit encoding so the result does not depend on the machine's locale):
    with open(text_file, "r", encoding="utf-8") as fp:
        text = fp.read()

    # Split to paragraphs and each paragraph to sentences:
    paragraphs = [paragraph.split(".") for paragraph in text.split("\n")]

    # Discover the newline indexes to restore the file to its structure post translation. Each index is the position
    # (in the flat sentences list) of the last sentence of a paragraph that is followed by a newline:
    newlines_indexes = set()
    sentences_count = 0
    for paragraph in paragraphs[:-1]:
        sentences_count += len(paragraph)
        newlines_indexes.add(sentences_count - 1)

    # Prepare the batches (each sentence from the paragraphs). Notice we add a dot not only to restore the sentence
    # structure but to ignore empty strings as it will ruin the translation:
    sentences = [f"{line}." for paragraph in paragraphs for line in paragraph]

    # Translate the sentences:
    translations = translation_pipeline(sentences, **translation_kwargs)

    # Restructure the full text from the sentences:
    translated_text = []
    for i, translation in enumerate(translations):
        # Get the translation:
        translated_sentence = translation["translation_text"]
        # Validate if it was an empty sentence before:
        if translated_sentence == ".":
            translated_sentence = ""
        # Check if needed to insert a newline. Membership is used (and not truthiness of the index) because index 0
        # is a valid newline position when the first line holds a single sentence:
        if i in newlines_indexes:
            translated_sentence += "\n"
        # Collect it:
        translated_text.append(translated_sentence)

    return "".join(translated_text)


def _save_to_file(
    translation: str, file_name: str, output_directory: pathlib.Path
) -> pathlib.Path:
    # Prepare the file full path (checking for no duplications):
    translation_file = output_directory / f"{file_name}.txt"
    i = 1
    while translation_file.exists():
        i += 1
        translation_file = output_directory / f"{file_name}_{i}.txt"

    # Make sure all directories are created:
    translation_file.parent.mkdir(exist_ok=True, parents=True)

    # Write to file:
    with open(translation_file, "w") as fp:
        fp.write(translation)

    return translation_file
 - base_image: mlrun/mlrun - origin_filename: '' - image: '' - default_handler: translate - disable_auto_mount: false - command: '' - description: Translate text files from one language to another -verbose: false +kind: job metadata: + tag: '' categories: - genai - NLP - tag: '' name: translate -kind: job diff --git a/functions/src/translate/item.yaml b/functions/src/translate/item.yaml index eb0e821e4..68f176ac2 100644 --- a/functions/src/translate/item.yaml +++ b/functions/src/translate/item.yaml @@ -12,7 +12,7 @@ labels: author: Iguazio maintainers: [] marketplaceType: '' -mlrunVersion: 1.7.0 +mlrunVersion: 1.10.0-rc41 name: translate platformVersion: 3.5.3 spec: @@ -23,8 +23,8 @@ spec: requirements: - transformers - sentencepiece - - torch + - torch>=2.6 - tqdm url: '' -version: 0.2.0 +version: 0.3.0 test_valid: True diff --git a/functions/src/translate/requirements.txt b/functions/src/translate/requirements.txt index 94e548463..746da576c 100644 --- a/functions/src/translate/requirements.txt +++ b/functions/src/translate/requirements.txt @@ -1,4 +1,4 @@ transformers tqdm -torch +torch>=2.6 sentencepiece \ No newline at end of file From f2ec9318edb32abf60aa4492f0b613fed7a93ddd Mon Sep 17 00:00:00 2001 From: Daniel Perez <100069700+danielperezz@users.noreply.github.com> Date: Mon, 17 Nov 2025 14:27:25 +0200 Subject: [PATCH 11/17] [CLI] Generated READMEs are produced with broken links to the items (#918) * fix * test fix * test fix * test fix * test fix * final workflow --- .github/workflows/test-all.yaml | 8 +++++++- cli/README.md | 4 ++-- cli/common/update_readme.py | 17 ++++++++++------- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test-all.yaml b/.github/workflows/test-all.yaml index 162804863..d8eb6c6ed 100644 --- a/.github/workflows/test-all.yaml +++ b/.github/workflows/test-all.yaml @@ -116,6 +116,10 @@ jobs: permissions: contents: write steps: + - name: Get the current branch name + shell: bash + run: echo 
"branch=${GITHUB_REF#refs/heads/}" >> $GITHUB_OUTPUT + id: branch - uses: actions/checkout@v4 with: fetch-depth: 0 @@ -128,7 +132,9 @@ jobs: pip install --upgrade pip pip install -r requirements.txt - name: Regenerate README tables - run: python -m cli.cli update-readme --asset functions --asset modules + env: + CHANNEL: ${{ steps.branch.outputs.branch }} + run: python -m cli.cli update-readme -c $CHANNEL --asset functions --asset modules - name: Commit & push (if changed) env: USERNAME: ${{ secrets.USERNAME }} diff --git a/cli/README.md b/cli/README.md index 4a3cd3bfc..31443b132 100644 --- a/cli/README.md +++ b/cli/README.md @@ -60,7 +60,7 @@ Example: Regenerate the `README.md` files in each of the asset directories (functions/modules). Usage: - `python -m cli.cli update-readme --asset TYPE` + `python -m cli.cli update-readme -c CHANNEL --asset TYPE` Example: - `python -m cli.cli update-readme --asset functions --asset modules` \ No newline at end of file + `python -m cli.cli update-readme -c master --asset functions --asset modules` \ No newline at end of file diff --git a/cli/common/update_readme.py b/cli/common/update_readme.py index 6bcab8d33..89b6aa094 100644 --- a/cli/common/update_readme.py +++ b/cli/common/update_readme.py @@ -25,6 +25,7 @@ COLUMNS = ("Name", "Description", "Kind", "Categories") @click.command("update-readme") +@click.option("-c", "--channel", default="master", help="Name of build channel") @click.option( "--asset", multiple=True, @@ -34,7 +35,7 @@ ) @click.option("--check", is_flag=True, help="Do not write; exit non‑zero if README(s) would change.") -def update_readme(asset: Iterable[str], +def update_readme(channel: str, asset: Iterable[str], check: bool) -> None: """ Regenerate the README tables for asset types from their item.yaml files. 
@@ -50,7 +51,7 @@ def update_readme(asset: Iterable[str], root = Path(".").resolve() asset_dir = root / t readme = asset_dir / "README.md" - rows = _rows_for_asset_type(asset_dir) + rows = _rows_for_asset_type(channel, asset_dir) table_md = _build_table_md(rows) old = readme.read_text() if readme.exists() else f"# {t.title()}\n\n" new = _replace_block(old, table_md) @@ -58,7 +59,7 @@ def update_readme(asset: Iterable[str], changed_any = True touched.append(str(readme)) else: - if _update_one(t): + if _update_one(channel, t): changed_any = True touched.append(str((Path(t) / "README.md").as_posix())) @@ -78,7 +79,7 @@ def update_readme(asset: Iterable[str], click.echo("No README changes.") -def _rows_for_asset_type(asset_dir: Path) -> List[Tuple[str, str, str, str]]: +def _rows_for_asset_type(channel: str, asset_dir: Path) -> List[Tuple[str, str, str, str]]: """Scan /src/*/item.yaml and return table rows.""" src = asset_dir / "src" if not src.exists(): @@ -97,7 +98,9 @@ def _rows_for_asset_type(asset_dir: Path) -> List[Tuple[str, str, str, str]]: cats = data.get("categories") or [] cats_str = ", ".join(c.strip() for c in cats) if isinstance(cats, list) else str(cats).strip() # Link the name to its source directory - link = f"[{asset_name}]({(asset_dir / 'src' / asset_name).as_posix()})" + # Construct the relative path from the repo root for the asset + rel_path = asset_dir.relative_to(Path(".").resolve()) + link = f"[{asset_name}](https://github.com/mlrun/functions/tree/{channel}/{rel_path}/src/{asset_name})" rows.append((link, desc, kind, cats_str)) rows.sort(key=lambda r: r[0].lower()) @@ -140,13 +143,13 @@ def _replace_block(readme_text: str, new_block: str) -> str: return readme_text[:start_close] + "\n" + new_block + "\n" + readme_text[ei:] -def _update_one(asset_type: str) -> bool: +def _update_one(channel: str, asset_type: str) -> bool: """Generate/replace the table in /README.md. 
Return True if changed.""" root = Path(".").resolve() asset_dir = root / asset_type readme = asset_dir / "README.md" - rows = _rows_for_asset_type(asset_dir) + rows = _rows_for_asset_type(channel, asset_dir) table_md = _build_table_md(rows) old = readme.read_text() if readme.exists() else f"# {asset_type.title()}\n\n" new = _replace_block(old, table_md) From 5c013ba18d25e6a840874575ac4aa71212e16397 Mon Sep 17 00:00:00 2001 From: iguazio-cicd Date: Mon, 17 Nov 2025 12:31:34 +0000 Subject: [PATCH 12/17] chore(readme): auto-update asset tables [skip ci] --- functions/README.md | 72 ++++++++++++++++++++++----------------------- modules/README.md | 6 ++-- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/functions/README.md b/functions/README.md index 08b1c7ad9..3618833a5 100644 --- a/functions/README.md +++ b/functions/README.md @@ -9,40 +9,40 @@ it is expected that contributors follow certain guidelines/protocols (please chi | Name | Description | Kind | Categories | | --- | --- | --- | --- | -| [aggregate](/home/runner/work/functions/functions/functions/src/aggregate) | Rolling aggregation over Metrics and Lables according to specifications | job | data-preparation | -| [arc_to_parquet](/home/runner/work/functions/functions/functions/src/arc_to_parquet) | retrieve remote archive, open and save as parquet | job | utils | -| [auto_trainer](/home/runner/work/functions/functions/functions/src/auto_trainer) | Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM. | job | machine-learning, model-training | -| [azureml_serving](/home/runner/work/functions/functions/functions/src/azureml_serving) | AzureML serving function | serving | machine-learning, model-serving | -| [azureml_utils](/home/runner/work/functions/functions/functions/src/azureml_utils) | Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom. 
| job | model-serving, utils | -| [batch_inference](/home/runner/work/functions/functions/functions/src/batch_inference) | Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis. | job | model-serving | -| [batch_inference_v2](/home/runner/work/functions/functions/functions/src/batch_inference_v2) | Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis. | job | model-serving | -| [describe](/home/runner/work/functions/functions/functions/src/describe) | describe and visualizes dataset stats | job | data-analysis | -| [describe_dask](/home/runner/work/functions/functions/functions/src/describe_dask) | describe and visualizes dataset stats | job | data-analysis | -| [describe_spark](/home/runner/work/functions/functions/functions/src/describe_spark) | | job | data-analysis | -| [feature_selection](/home/runner/work/functions/functions/functions/src/feature_selection) | Select features through multiple Statistical and Model filters | job | data-preparation, machine-learning | -| [gen_class_data](/home/runner/work/functions/functions/functions/src/gen_class_data) | Create a binary classification sample dataset and save. | job | data-generation | -| [github_utils](/home/runner/work/functions/functions/functions/src/github_utils) | add comments to github pull request | job | utils | -| [hugging_face_serving](/home/runner/work/functions/functions/functions/src/hugging_face_serving) | Generic Hugging Face model server. | serving | genai, model-serving | -| [load_dataset](/home/runner/work/functions/functions/functions/src/load_dataset) | load a toy dataset from scikit-learn | job | data-preparation | -| [mlflow_utils](/home/runner/work/functions/functions/functions/src/mlflow_utils) | Mlflow model server, and additional utils. 
| serving | model-serving, utils | -| [model_server](/home/runner/work/functions/functions/functions/src/model_server) | generic sklearn model server | nuclio:serving | model-serving, machine-learning | -| [model_server_tester](/home/runner/work/functions/functions/functions/src/model_server_tester) | test model servers | job | monitoring, model-serving | -| [noise_reduction](/home/runner/work/functions/functions/functions/src/noise_reduction) | Reduce noise from audio files | job | data-preparation, audio | -| [onnx_utils](/home/runner/work/functions/functions/functions/src/onnx_utils) | ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun. | job | utils, deep-learning | -| [open_archive](/home/runner/work/functions/functions/functions/src/open_archive) | Open a file/object archive into a target directory | job | utils | -| [pii_recognizer](/home/runner/work/functions/functions/functions/src/pii_recognizer) | This function is used to recognize PII in a directory of text files | job | data-preparation, NLP | -| [pyannote_audio](/home/runner/work/functions/functions/functions/src/pyannote_audio) | pyannote's speech diarization of audio files | job | deep-learning, audio | -| [question_answering](/home/runner/work/functions/functions/functions/src/question_answering) | GenAI approach of question answering on a given data | job | genai | -| [send_email](/home/runner/work/functions/functions/functions/src/send_email) | Send Email messages through SMTP server | job | utils | -| [silero_vad](/home/runner/work/functions/functions/functions/src/silero_vad) | Silero VAD (Voice Activity Detection) functions. 
| job | deep-learning, audio | -| [sklearn_classifier](/home/runner/work/functions/functions/functions/src/sklearn_classifier) | train any classifier using scikit-learn's API | job | machine-learning, model-training | -| [sklearn_classifier_dask](/home/runner/work/functions/functions/functions/src/sklearn_classifier_dask) | train any classifier using scikit-learn's API over Dask | job | machine-learning, model-training | -| [structured_data_generator](/home/runner/work/functions/functions/functions/src/structured_data_generator) | GenAI approach of generating structured data according to a given schema | job | data-generation, genai | -| [test_classifier](/home/runner/work/functions/functions/functions/src/test_classifier) | test a classifier using held-out or new data | job | machine-learning, model-testing | -| [text_to_audio_generator](/home/runner/work/functions/functions/functions/src/text_to_audio_generator) | Generate audio file from text using different speakers | job | data-generation, audio | -| [tf2_serving](/home/runner/work/functions/functions/functions/src/tf2_serving) | tf2 image classification server | nuclio:serving | model-serving, machine-learning | -| [transcribe](/home/runner/work/functions/functions/functions/src/transcribe) | Transcribe audio files into text files | job | audio, genai | -| [translate](/home/runner/work/functions/functions/functions/src/translate) | Translate text files from one language to another | job | genai, NLP | -| [v2_model_server](/home/runner/work/functions/functions/functions/src/v2_model_server) | generic sklearn model server | serving | model-serving, machine-learning | -| [v2_model_tester](/home/runner/work/functions/functions/functions/src/v2_model_tester) | test v2 model servers | job | model-testing, machine-learning | +| [aggregate](https://github.com/mlrun/functions/tree/development/functions/src/aggregate) | Rolling aggregation over Metrics and Lables according to specifications | job | data-preparation | 
+| [arc_to_parquet](https://github.com/mlrun/functions/tree/development/functions/src/arc_to_parquet) | retrieve remote archive, open and save as parquet | job | utils | +| [auto_trainer](https://github.com/mlrun/functions/tree/development/functions/src/auto_trainer) | Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM. | job | machine-learning, model-training | +| [azureml_serving](https://github.com/mlrun/functions/tree/development/functions/src/azureml_serving) | AzureML serving function | serving | machine-learning, model-serving | +| [azureml_utils](https://github.com/mlrun/functions/tree/development/functions/src/azureml_utils) | Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom. | job | model-serving, utils | +| [batch_inference](https://github.com/mlrun/functions/tree/development/functions/src/batch_inference) | Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis. | job | model-serving | +| [batch_inference_v2](https://github.com/mlrun/functions/tree/development/functions/src/batch_inference_v2) | Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis. 
| job | model-serving | +| [describe](https://github.com/mlrun/functions/tree/development/functions/src/describe) | describe and visualizes dataset stats | job | data-analysis | +| [describe_dask](https://github.com/mlrun/functions/tree/development/functions/src/describe_dask) | describe and visualizes dataset stats | job | data-analysis | +| [describe_spark](https://github.com/mlrun/functions/tree/development/functions/src/describe_spark) | | job | data-analysis | +| [feature_selection](https://github.com/mlrun/functions/tree/development/functions/src/feature_selection) | Select features through multiple Statistical and Model filters | job | data-preparation, machine-learning | +| [gen_class_data](https://github.com/mlrun/functions/tree/development/functions/src/gen_class_data) | Create a binary classification sample dataset and save. | job | data-generation | +| [github_utils](https://github.com/mlrun/functions/tree/development/functions/src/github_utils) | add comments to github pull request | job | utils | +| [hugging_face_serving](https://github.com/mlrun/functions/tree/development/functions/src/hugging_face_serving) | Generic Hugging Face model server. | serving | genai, model-serving | +| [load_dataset](https://github.com/mlrun/functions/tree/development/functions/src/load_dataset) | load a toy dataset from scikit-learn | job | data-preparation | +| [mlflow_utils](https://github.com/mlrun/functions/tree/development/functions/src/mlflow_utils) | Mlflow model server, and additional utils. 
| serving | model-serving, utils | +| [model_server](https://github.com/mlrun/functions/tree/development/functions/src/model_server) | generic sklearn model server | nuclio:serving | model-serving, machine-learning | +| [model_server_tester](https://github.com/mlrun/functions/tree/development/functions/src/model_server_tester) | test model servers | job | monitoring, model-serving | +| [noise_reduction](https://github.com/mlrun/functions/tree/development/functions/src/noise_reduction) | Reduce noise from audio files | job | data-preparation, audio | +| [onnx_utils](https://github.com/mlrun/functions/tree/development/functions/src/onnx_utils) | ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun. | job | utils, deep-learning | +| [open_archive](https://github.com/mlrun/functions/tree/development/functions/src/open_archive) | Open a file/object archive into a target directory | job | utils | +| [pii_recognizer](https://github.com/mlrun/functions/tree/development/functions/src/pii_recognizer) | This function is used to recognize PII in a directory of text files | job | data-preparation, NLP | +| [pyannote_audio](https://github.com/mlrun/functions/tree/development/functions/src/pyannote_audio) | pyannote's speech diarization of audio files | job | deep-learning, audio | +| [question_answering](https://github.com/mlrun/functions/tree/development/functions/src/question_answering) | GenAI approach of question answering on a given data | job | genai | +| [send_email](https://github.com/mlrun/functions/tree/development/functions/src/send_email) | Send Email messages through SMTP server | job | utils | +| [silero_vad](https://github.com/mlrun/functions/tree/development/functions/src/silero_vad) | Silero VAD (Voice Activity Detection) functions. 
| job | deep-learning, audio | +| [sklearn_classifier](https://github.com/mlrun/functions/tree/development/functions/src/sklearn_classifier) | train any classifier using scikit-learn's API | job | machine-learning, model-training | +| [sklearn_classifier_dask](https://github.com/mlrun/functions/tree/development/functions/src/sklearn_classifier_dask) | train any classifier using scikit-learn's API over Dask | job | machine-learning, model-training | +| [structured_data_generator](https://github.com/mlrun/functions/tree/development/functions/src/structured_data_generator) | GenAI approach of generating structured data according to a given schema | job | data-generation, genai | +| [test_classifier](https://github.com/mlrun/functions/tree/development/functions/src/test_classifier) | test a classifier using held-out or new data | job | machine-learning, model-testing | +| [text_to_audio_generator](https://github.com/mlrun/functions/tree/development/functions/src/text_to_audio_generator) | Generate audio file from text using different speakers | job | data-generation, audio | +| [tf2_serving](https://github.com/mlrun/functions/tree/development/functions/src/tf2_serving) | tf2 image classification server | nuclio:serving | model-serving, machine-learning | +| [transcribe](https://github.com/mlrun/functions/tree/development/functions/src/transcribe) | Transcribe audio files into text files | job | audio, genai | +| [translate](https://github.com/mlrun/functions/tree/development/functions/src/translate) | Translate text files from one language to another | job | genai, NLP | +| [v2_model_server](https://github.com/mlrun/functions/tree/development/functions/src/v2_model_server) | generic sklearn model server | serving | model-serving, machine-learning | +| [v2_model_tester](https://github.com/mlrun/functions/tree/development/functions/src/v2_model_tester) | test v2 model servers | job | model-testing, machine-learning | diff --git a/modules/README.md b/modules/README.md 
index f49c8472f..38cb474d3 100644 --- a/modules/README.md +++ b/modules/README.md @@ -6,7 +6,7 @@ | Name | Description | Kind | Categories | | --- | --- | --- | --- | -| [count_events](/home/runner/work/functions/functions/modules/src/count_events) | Count events in each time window | monitoring_application | model-serving | -| [evidently](/home/runner/work/functions/functions/modules/src/evidently) | Demonstrates Evidently integration in MLRun for data quality and drift monitoring using the Iris dataset | monitoring_application | model-serving, structured-ML | -| [histogram_data_drift](/home/runner/work/functions/functions/modules/src/histogram_data_drift) | Model-monitoring application for detecting and visualizing data drift | monitoring_application | model-serving, structured-ML | +| [count_events](https://github.com/mlrun/functions/tree/development/modules/src/count_events) | Count events in each time window | monitoring_application | model-serving | +| [evidently](https://github.com/mlrun/functions/tree/development/modules/src/evidently) | Demonstrates Evidently integration in MLRun for data quality and drift monitoring using the Iris dataset | monitoring_application | model-serving, structured-ML | +| [histogram_data_drift](https://github.com/mlrun/functions/tree/development/modules/src/histogram_data_drift) | Model-monitoring application for detecting and visualizing data drift | monitoring_application | model-serving, structured-ML | From 2f3397477cd0ebdf24bee0e63a0e82db761ed9c5 Mon Sep 17 00:00:00 2001 From: guylei-code Date: Mon, 17 Nov 2025 15:06:43 +0200 Subject: [PATCH 13/17] OpenAI Module without notebook (#917) * First commit OpenAI Module * First commit OpenAI Module * Update example filename in item.yaml * Delete modules/src/openai_proxy/requirements.txt No need due to no unitest * Update item.yaml for OpenAI application configuration * Update modules/src/openai_proxy/openai.py Co-authored-by: Daniel Perez 
<100069700+danielperezz@users.noreply.github.com> * Change category name from 'GenAI' to 'genai' * Update package requirements with version constraints * Second commit adding notebook * Refactor OpenAI proxy to use base64 encoded script Refactor OpenAI proxy implementation to use base64 encoded script and update FastAPI app configuration. * Change deployment method to OpenAIModule * Third commit adding notebook * Third commit adding notebook * Remove package requirements from item.yaml Removed specific requirements for fastapi and requests. * Rename item and update kind in YAML * Update openai.py * Third commit adding notebook * Fix after review * Fix after review --------- Co-authored-by: Daniel Perez <100069700+danielperezz@users.noreply.github.com> --- modules/src/openai_proxy_app/item.yaml | 19 +++++ .../openai_proxy_app/openai_proxy_app.ipynb | 72 +++++++++++++++++++ .../src/openai_proxy_app/openai_proxy_app.py | 56 +++++++++++++++ 3 files changed, 147 insertions(+) create mode 100644 modules/src/openai_proxy_app/item.yaml create mode 100644 modules/src/openai_proxy_app/openai_proxy_app.ipynb create mode 100644 modules/src/openai_proxy_app/openai_proxy_app.py diff --git a/modules/src/openai_proxy_app/item.yaml b/modules/src/openai_proxy_app/item.yaml new file mode 100644 index 000000000..bf295cf2a --- /dev/null +++ b/modules/src/openai_proxy_app/item.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +categories: +- genai +description: OpenAI application runtime based on fastapi +example: openai_proxy_app.ipynb +generationDate: 2025-11-11:12-25 +hidden: false +labels: + author: Iguazio +mlrunVersion: 1.10.0 +name: openai_proxy_app +spec: + filename: openai_proxy_app.py + image: mlrun/mlrun + requirements: + - fastapi>=0.110,<1.0 + - requests>=2.31,<3.0 + kind: generic +version: 1.0.0 diff --git a/modules/src/openai_proxy_app/openai_proxy_app.ipynb b/modules/src/openai_proxy_app/openai_proxy_app.ipynb new file mode 100644 index 000000000..123934fbd --- /dev/null +++ 
b/modules/src/openai_proxy_app/openai_proxy_app.ipynb @@ -0,0 +1,72 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "220629c8-17aa-45f6-ac81-0ca31e165412", + "metadata": {}, + "source": [ + "# OpenAI Module Demo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "967b4d5d-7250-40bf-8149-de11e1e3244c", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17d208f4-a00a-42ef-a849-0fa79bed10cb", + "metadata": {}, + "outputs": [], + "source": [ + "project = mlrun.get_or_create_project(\"fastapi-openai\",user_project=True,context=\"./src\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67c93a0d-8240-48b8-808e-9cd0af418309", + "metadata": {}, + "outputs": [], + "source": [ + "app = mlrun.import_module(\"hub://openai_proxy_app\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93e67d6a-5f53-4bda-b0b5-4e2977088139", + "metadata": {}, + "outputs": [], + "source": "app.OpenAIModule(project).fastapi_app.deploy()" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/modules/src/openai_proxy_app/openai_proxy_app.py b/modules/src/openai_proxy_app/openai_proxy_app.py new file mode 100644 index 000000000..a0e9df7ac --- /dev/null +++ b/modules/src/openai_proxy_app/openai_proxy_app.py @@ -0,0 +1,56 @@ +# Copyright 2025 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#This module acts as a lightweight gateway to OpenAI-compatible APIs. +#You can send chat prompts, create embeddings, or get model responses without worrying about authentication or endpoint differences. +#It simplifies access so you can test, analyze, or integrate AI features directly into your projects or notebooks with minimal setup. + + +BASE64 = "IyBvcGVuYWlfcHJveHkvb3BlbmFpLnB5CgppbXBvcnQgb3MKaW1wb3J0IGpzb24KZnJvbSB1cmxsaWIucGFyc2UgaW1wb3J0IHVybGpvaW4KZnJvbSB0eXBpbmcgaW1wb3J0IEFueSwgRGljdCwgTGlzdCwgT3B0aW9uYWwKCmltcG9ydCByZXF1ZXN0cwpmcm9tIGZhc3RhcGkgaW1wb3J0IEZhc3RBUEksIFJlcXVlc3QsIFJlc3BvbnNlLCBCb2R5CgphcHAgPSBGYXN0QVBJKAogICAgdGl0bGU9Ik9wZW5BSSBQcm94eSBBcHAiLAogICAgZGVzY3JpcHRpb249IkxvY2FsIEZhc3RBUEkgcHJveHkgZm9yIE9wZW5BSSBzdHlsZSBlbmRwb2ludHMiLAogICAgdmVyc2lvbj0iMS4wLjAiLAopCgpPUEVOQUlfQkFTRV9VUkwgPSBvcy5nZXRlbnYoIk9QRU5BSV9CQVNFX1VSTCIsICJodHRwczovL2FwaS5vcGVuYWkuY29tIikucnN0cmlwKCIvIikKT1BFTkFJX0FQSV9LRVkgPSBvcy5nZXRlbnYoIk9QRU5BSV9BUElfS0VZIiwgIiIpCk9QRU5BSV9ERUZBVUxUX01PREVMID0gb3MuZ2V0ZW52KCJPUEVOQUlfREVGQVVMVF9NT0RFTCIsICJncHQtNG8tbWluaSIpCgoKZGVmIGJ1aWxkX2hlYWRlcnMoaW5jb21pbmc6IGRpY3QpIC0+IGRpY3Q6CiAgICBoZWFkZXJzID0ge30KICAgIGF1dGggPSBpbmNvbWluZy5nZXQoImF1dGhvcml6YXRpb24iKSBvciBpbmNvbWluZy5nZXQoIkF1dGhvcml6YXRpb24iKQogICAgaWYgYXV0aDoKICAgICAgICBoZWFkZXJzWyJBdXRob3JpemF0aW9uIl0gPSBhdXRoCiAgICBlbGlmIE9QRU5BSV9BUElfS0VZOgogICAgICAgIGhlYWRlcnNbIkF1dGhvcml6YXRpb24iXSA9IGYiQmVhcmVyIHtPUEVOQUlfQVBJX0tFWX0iCiAgICBjdHlwZSA9IGluY29taW5nLmdldCgiY29udGVudC10eXBlIikgb3IgaW5jb21pbmcuZ2V0KCJDb250ZW50LVR5cGUiKSBvciAiYXBwbGljYXRpb24vanNvbiIKICAgIGhlYWRlcn
NbIkNvbnRlbnQtVHlwZSJdID0gY3R5cGUKICAgIHJldHVybiBoZWFkZXJzCgoKZGVmIGJ1aWxkX3RhcmdldChwYXRoOiBzdHIpIC0+IHN0cjoKICAgIGJhc2UgPSBPUEVOQUlfQkFTRV9VUkwKICAgIGlmIGJhc2UuZW5kc3dpdGgoIi92MSIpIG9yIGJhc2UuZW5kc3dpdGgoIi92MS8iKToKICAgICAgICBiYXNlID0gYmFzZVs6LTNdIGlmIGJhc2UuZW5kc3dpdGgoIi92MSIpIGVsc2UgYmFzZVs6LTRdCiAgICByZXR1cm4gdXJsam9pbihiYXNlICsgIi8iLCBwYXRoLmxzdHJpcCgiLyIpKQoKCmRlZiBmb3J3YXJkX2pzb24ocGF0aDogc3RyLCBib2R5OiBkaWN0LCBoZWFkZXJzOiBkaWN0LCBxdWVyeTogZGljdCk6CiAgICB0YXJnZXQgPSBidWlsZF90YXJnZXQocGF0aCkKICAgIHJlc3AgPSByZXF1ZXN0cy5wb3N0KAogICAgICAgIHRhcmdldCwKICAgICAgICBoZWFkZXJzPWhlYWRlcnMsCiAgICAgICAgcGFyYW1zPXF1ZXJ5LAogICAgICAgIGpzb249Ym9keSwKICAgICAgICB0aW1lb3V0PTYwLAogICAgKQogICAgcmV0dXJuIHJlc3AKCkBhcHAuZ2V0KCIvIikKZGVmIGhlYWx0aCgpOgogICAgcmV0dXJuIHsic3RhdHVzIjogIm9rIn0KCgojIHJlbGF4ZWQgY2hhdCBlbmRwb2ludCwgYWNjZXB0cyBhbnkgSlNPTiB0aGF0IGluY2x1ZGVzIG1lc3NhZ2VzCkBhcHAucG9zdCgiL3YxL2NoYXQvY29tcGxldGlvbnMiKQphc3luYyBkZWYgY2hhdF9jb21wbGV0aW9ucygKICAgIHJlcXVlc3Q6IFJlcXVlc3QsCiAgICBwYXlsb2FkOiBEaWN0W3N0ciwgQW55XSA9IEJvZHkoLi4uKSwKKToKICAgIGlmICJtZXNzYWdlcyIgbm90IGluIHBheWxvYWQgb3Igbm90IGlzaW5zdGFuY2UocGF5bG9hZFsibWVzc2FnZXMiXSwgbGlzdCk6CiAgICAgICAgcmV0dXJuIFJlc3BvbnNlKAogICAgICAgICAgICBjb250ZW50PWpzb24uZHVtcHMoeyJlcnJvciI6ICJtZXNzYWdlcyBtdXN0IGJlIGEgbGlzdCBvZiBjaGF0IG1lc3NhZ2VzIn0pLAogICAgICAgICAgICBzdGF0dXNfY29kZT00MDAsCiAgICAgICAgICAgIG1lZGlhX3R5cGU9ImFwcGxpY2F0aW9uL2pzb24iLAogICAgICAgICkKCiAgICBpZiAibW9kZWwiIG5vdCBpbiBwYXlsb2FkIG9yIHBheWxvYWRbIm1vZGVsIl0gaXMgTm9uZToKICAgICAgICBwYXlsb2FkWyJtb2RlbCJdID0gT1BFTkFJX0RFRkFVTFRfTU9ERUwKCiAgICBoZWFkZXJzID0gYnVpbGRfaGVhZGVycyhkaWN0KHJlcXVlc3QuaGVhZGVycykpCiAgICByZXNwID0gZm9yd2FyZF9qc29uKCIvdjEvY2hhdC9jb21wbGV0aW9ucyIsIHBheWxvYWQsIGhlYWRlcnMsIGRpY3QocmVxdWVzdC5xdWVyeV9wYXJhbXMpKQogICAgcmV0dXJuIFJlc3BvbnNlKAogICAgICAgIGNvbnRlbnQ9cmVzcC5jb250ZW50LAogICAgICAgIHN0YXR1c19jb2RlPXJlc3Auc3RhdHVzX2NvZGUsCiAgICAgICAgbWVkaWFfdHlwZT1yZXNwLmhlYWRlcnMuZ2V0KCJDb250ZW50LVR5cGUiLCAiYXBwbGljYXRpb24vanNvbiIpLAogICAgKQoKCkBhcHAucG9zdCgiL3YxL2VtYmVkZG
luZ3MiKQphc3luYyBkZWYgZW1iZWRkaW5ncygKICAgIHJlcXVlc3Q6IFJlcXVlc3QsCiAgICBwYXlsb2FkOiBEaWN0W3N0ciwgQW55XSA9IEJvZHkoLi4uKSwKKToKICAgIGlmICJtb2RlbCIgbm90IGluIHBheWxvYWQgb3Igbm90IHBheWxvYWRbIm1vZGVsIl06CiAgICAgICAgcGF5bG9hZFsibW9kZWwiXSA9ICJ0ZXh0LWVtYmVkZGluZy0zLXNtYWxsIgogICAgaGVhZGVycyA9IGJ1aWxkX2hlYWRlcnMoZGljdChyZXF1ZXN0LmhlYWRlcnMpKQogICAgcmVzcCA9IGZvcndhcmRfanNvbigiL3YxL2VtYmVkZGluZ3MiLCBwYXlsb2FkLCBoZWFkZXJzLCBkaWN0KHJlcXVlc3QucXVlcnlfcGFyYW1zKSkKICAgIHJldHVybiBSZXNwb25zZSgKICAgICAgICBjb250ZW50PXJlc3AuY29udGVudCwKICAgICAgICBzdGF0dXNfY29kZT1yZXNwLnN0YXR1c19jb2RlLAogICAgICAgIG1lZGlhX3R5cGU9cmVzcC5oZWFkZXJzLmdldCgiQ29udGVudC1UeXBlIiwgImFwcGxpY2F0aW9uL2pzb24iKSwKICAgICkKCgpAYXBwLnBvc3QoIi92MS9yZXNwb25zZXMiKQphc3luYyBkZWYgcmVzcG9uc2VzX2FwaSgKICAgIHJlcXVlc3Q6IFJlcXVlc3QsCiAgICBwYXlsb2FkOiBEaWN0W3N0ciwgQW55XSA9IEJvZHkoLi4uKSwKKToKICAgIGlmICJtb2RlbCIgbm90IGluIHBheWxvYWQgb3IgcGF5bG9hZFsibW9kZWwiXSBpcyBOb25lOgogICAgICAgIHBheWxvYWRbIm1vZGVsIl0gPSBPUEVOQUlfREVGQVVMVF9NT0RFTAogICAgaGVhZGVycyA9IGJ1aWxkX2hlYWRlcnMoZGljdChyZXF1ZXN0LmhlYWRlcnMpKQogICAgcmVzcCA9IGZvcndhcmRfanNvbigiL3YxL3Jlc3BvbnNlcyIsIHBheWxvYWQsIGhlYWRlcnMsIGRpY3QocmVxdWVzdC5xdWVyeV9wYXJhbXMpKQogICAgcmV0dXJuIFJlc3BvbnNlKAogICAgICAgIGNvbnRlbnQ9cmVzcC5jb250ZW50LAogICAgICAgIHN0YXR1c19jb2RlPXJlc3Auc3RhdHVzX2NvZGUsCiAgICAgICAgbWVkaWFfdHlwZT1yZXNwLmhlYWRlcnMuZ2V0KCJDb250ZW50LVR5cGUiLCAiYXBwbGljYXRpb24vanNvbiIpLAogICAgKQoKCiMgLS0tLS0tLS0tLS0tLS0tLSBjbGllbnQgLS0tLS0tLS0tLS0tLS0tLQpjbGFzcyBPcGVuQUlQcm94eUNsaWVudDoKICAgICIiIgogICAgU2ltcGxlIGNsaWVudCBmb3IgdGhlIGxvY2FsIHByb3h5LgogICAgRGVmYXVsdCBiYXNlIHVybCBpcyBodHRwOi8vbG9jYWxob3N0OjgwMDAKICAgIElmIGFwaV9rZXkgaXMgbm90IHByb3ZpZGVkLCBpdCB1c2VzIE9QRU5BSV9BUElfS0VZIGZyb20gZW52aXJvbm1lbnQuCiAgICAiIiIKCiAgICBkZWYgX19pbml0X18oc2VsZiwgYmFzZV91cmw6IHN0ciA9ICJodHRwOi8vbG9jYWxob3N0OjgwMDAiLCBhcGlfa2V5OiBPcHRpb25hbFtzdHJdID0gTm9uZSk6CiAgICAgICAgc2VsZi5iYXNlX3VybCA9IGJhc2VfdXJsLnJzdHJpcCgiLyIpCiAgICAgICAgc2VsZi5hcGlfa2V5ID0gYXBpX2tleQoKICAgIGRlZiBfaGVhZGVycyhzZWxmKSAtPiBEaWN0W3N0ciwgc3RyXT
oKICAgICAgICBoZWFkZXJzID0geyJDb250ZW50LVR5cGUiOiAiYXBwbGljYXRpb24vanNvbiJ9CiAgICAgICAga2V5ID0gc2VsZi5hcGlfa2V5IG9yIG9zLmdldGVudigiT1BFTkFJX0FQSV9LRVkiLCAiIikKICAgICAgICBpZiBrZXk6CiAgICAgICAgICAgIGhlYWRlcnNbIkF1dGhvcml6YXRpb24iXSA9IGYiQmVhcmVyIHtrZXl9IgogICAgICAgIHJldHVybiBoZWFkZXJzCgogICAgZGVmIGNoYXQoc2VsZiwgbWVzc2FnZXM6IExpc3RbRGljdFtzdHIsIHN0cl1dLCBtb2RlbDogT3B0aW9uYWxbc3RyXSA9IE5vbmUpIC0+IERpY3Rbc3RyLCBBbnldOgogICAgICAgIGJvZHk6IERpY3Rbc3RyLCBBbnldID0geyJtZXNzYWdlcyI6IG1lc3NhZ2VzfQogICAgICAgIGlmIG1vZGVsOgogICAgICAgICAgICBib2R5WyJtb2RlbCJdID0gbW9kZWwKICAgICAgICByZXNwID0gcmVxdWVzdHMucG9zdCgKICAgICAgICAgICAgZiJ7c2VsZi5iYXNlX3VybH0vdjEvY2hhdC9jb21wbGV0aW9ucyIsCiAgICAgICAgICAgIGhlYWRlcnM9c2VsZi5faGVhZGVycygpLAogICAgICAgICAgICBqc29uPWJvZHksCiAgICAgICAgICAgIHRpbWVvdXQ9NjAsCiAgICAgICAgKQogICAgICAgIHJlc3AucmFpc2VfZm9yX3N0YXR1cygpCiAgICAgICAgcmV0dXJuIHJlc3AuanNvbigpCgogICAgZGVmIGVtYmVkZGluZ3Moc2VsZiwgdGV4dDogQW55LCBtb2RlbDogT3B0aW9uYWxbc3RyXSA9IE5vbmUpIC0+IERpY3Rbc3RyLCBBbnldOgogICAgICAgIGJvZHk6IERpY3Rbc3RyLCBBbnldID0geyJpbnB1dCI6IHRleHR9CiAgICAgICAgaWYgbW9kZWw6CiAgICAgICAgICAgIGJvZHlbIm1vZGVsIl0gPSBtb2RlbAogICAgICAgIHJlc3AgPSByZXF1ZXN0cy5wb3N0KAogICAgICAgICAgICBmIntzZWxmLmJhc2VfdXJsfS92MS9lbWJlZGRpbmdzIiwKICAgICAgICAgICAgaGVhZGVycz1zZWxmLl9oZWFkZXJzKCksCiAgICAgICAgICAgIGpzb249Ym9keSwKICAgICAgICAgICAgdGltZW91dD02MCwKICAgICAgICApCiAgICAgICAgcmVzcC5yYWlzZV9mb3Jfc3RhdHVzKCkKICAgICAgICByZXR1cm4gcmVzcC5qc29uKCkKCiAgICBkZWYgcmVzcG9uc2VzKHNlbGYsIGlucHV0X3RleHQ6IEFueSwgbW9kZWw6IE9wdGlvbmFsW3N0cl0gPSBOb25lKSAtPiBEaWN0W3N0ciwgQW55XToKICAgICAgICBib2R5OiBEaWN0W3N0ciwgQW55XSA9IHsiaW5wdXQiOiBpbnB1dF90ZXh0fQogICAgICAgIGlmIG1vZGVsOgogICAgICAgICAgICBib2R5WyJtb2RlbCJdID0gbW9kZWwKICAgICAgICByZXNwID0gcmVxdWVzdHMucG9zdCgKICAgICAgICAgICAgZiJ7c2VsZi5iYXNlX3VybH0vdjEvcmVzcG9uc2VzIiwKICAgICAgICAgICAgaGVhZGVycz1zZWxmLl9oZWFkZXJzKCksCiAgICAgICAgICAgIGpzb249Ym9keSwKICAgICAgICAgICAgdGltZW91dD02MCwKICAgICAgICApCiAgICAgICAgcmVzcC5yYWlzZV9mb3Jfc3RhdHVzKCkKICAgICAgICByZXR1cm4gcmVzcC5qc29uKCkKCgojIG9wdGlvbm
FsIHF1aWNrIHNlbGYgdGVzdCB3aGVuIHJ1bm5pbmcgdGhpcyBmaWxlIGRpcmVjdGx5CmlmIF9fbmFtZV9fID09ICJfX21haW5fXyI6CiAgICAjIHN0YXJ0IHRoZSBzZXJ2ZXIgaW4gYW5vdGhlciB0ZXJtaW5hbCBmaXJzdDoKICAgICMgdXZpY29ybiBvcGVuYWlfcHJveHkub3BlbmFpOmFwcCAtLWhvc3QgMC4wLjAuMCAtLXBvcnQgODAwMCAtLXJlbG9hZAogICAgYyA9IE9wZW5BSVByb3h5Q2xpZW50KCkKICAgIHRyeToKICAgICAgICBwcmludCgiSGVhbHRoOiIsIHJlcXVlc3RzLmdldChmIntjLmJhc2VfdXJsfS8iKS5qc29uKCkpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgcHJpbnQoIlNlcnZlciBub3QgcnVubmluZzoiLCBlKQo=" +CMD = r''' +set -e +python - <<'PY' +import os, base64, pathlib +code = os.environ["BASE64"] +pathlib.Path("/opt/app").mkdir(parents=True, exist_ok=True) +with open("/opt/app/openai_proxy_app.py","wb") as f: + f.write(base64.b64decode(code)) +print("Wrote /opt/app/openai_proxy_app.py") +PY + +exec gunicorn openai_proxy_app:app \ + --chdir /opt/app \ + --bind 0.0.0.0:8000 \ + --worker-class uvicorn.workers.UvicornWorker \ + --log-level info +'''.strip() +class OpenAIModule: + def __init__(self,project): + self.project = project + self.fastapi_app = self.project.set_function(name="openai",kind="application",image="python:3.11") + self.fastapi_app.with_requirements([ + "fastapi>=0.110,<1.0", + "uvicorn[standard]>=0.29,<1.0", + "gunicorn>=21.2,<22.0", + "requests>=2.31,<3.0", + ]) + self.fastapi_app.set_env("BASE64",BASE64) + self.fastapi_app.set_internal_application_port(8000) + self.fastapi_app.spec.command = "/bin/sh" + self.fastapi_app.spec.args = ["-c", CMD] + + + + + From 277e11d32a969d32502517a3fa2876db1b391d41 Mon Sep 17 00:00:00 2001 From: iguazio-cicd Date: Mon, 17 Nov 2025 13:07:54 +0000 Subject: [PATCH 14/17] chore(readme): auto-update asset tables [skip ci] --- modules/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/README.md b/modules/README.md index 38cb474d3..79da6c416 100644 --- a/modules/README.md +++ b/modules/README.md @@ -9,4 +9,5 @@ | [count_events](https://github.com/mlrun/functions/tree/development/modules/src/count_events) | Count events in each
time window | monitoring_application | model-serving | | [evidently](https://github.com/mlrun/functions/tree/development/modules/src/evidently) | Demonstrates Evidently integration in MLRun for data quality and drift monitoring using the Iris dataset | monitoring_application | model-serving, structured-ML | | [histogram_data_drift](https://github.com/mlrun/functions/tree/development/modules/src/histogram_data_drift) | Model-monitoring application for detecting and visualizing data drift | monitoring_application | model-serving, structured-ML | +| [openai_proxy_app](https://github.com/mlrun/functions/tree/development/modules/src/openai_proxy_app) | OpenAI application runtime based on fastapi | generic | genai | From 356cb3841990d37978cda2fc49592aaddd18f839 Mon Sep 17 00:00:00 2001 From: Daniel Perez <100069700+danielperezz@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:51:52 +0200 Subject: [PATCH 15/17] [Evidently] Fill example notebook (#919) * add notebook + rename directory + correct evidently version * remove extra cell --- modules/src/evidently/evidently_iris.ipynb | 37 - .../src/evidently_iris/evidently_iris.ipynb | 1295 +++++++++++++++++ .../evidently_iris.py | 0 .../{evidently => evidently_iris}/item.yaml | 2 +- .../requirements.txt | 2 +- .../test_evidently_iris.py | 0 6 files changed, 1297 insertions(+), 39 deletions(-) delete mode 100644 modules/src/evidently/evidently_iris.ipynb create mode 100644 modules/src/evidently_iris/evidently_iris.ipynb rename modules/src/{evidently => evidently_iris}/evidently_iris.py (100%) rename modules/src/{evidently => evidently_iris}/item.yaml (96%) rename modules/src/{evidently => evidently_iris}/requirements.txt (60%) rename modules/src/{evidently => evidently_iris}/test_evidently_iris.py (100%) diff --git a/modules/src/evidently/evidently_iris.ipynb b/modules/src/evidently/evidently_iris.ipynb deleted file mode 100644 index 54f657bb0..000000000 --- a/modules/src/evidently/evidently_iris.ipynb +++ /dev/null @@ -1,37 
+0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "initial_id", - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/modules/src/evidently_iris/evidently_iris.ipynb b/modules/src/evidently_iris/evidently_iris.ipynb new file mode 100644 index 000000000..c3299f82f --- /dev/null +++ b/modules/src/evidently_iris/evidently_iris.ipynb @@ -0,0 +1,1295 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8f92a6bb-e4b4-4b5d-91c7-2e99c97798c6", + "metadata": {}, + "source": [ + "# Evidently Iris Demo\n", + "\n", + "In this notebook, we’ll import the hub’s Evidently demo app, which monitors data quality and drift on Scikit-Learn’s Iris dataset. We’ll run it using the `evaluate()` method with a slightly modified dataset as the monitored data.\n", + "\n", + "The Evidently Iris module demonstrates a simple example of integrating MLRun with Evidently for data monitoring, which you can adapt to fit your own project needs or use as a reference implementation." 
+ ] + }, + { + "cell_type": "markdown", + "id": "a6775277-5f4f-4261-9a06-5c6d87cb85c7", + "metadata": {}, + "source": [ + "## Set up an MLRun project and prepare the data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d7a8c256-035f-4261-b494-f3f3cbd8c77c", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun\n", + "project = mlrun.get_or_create_project(\"evidently-demo\",'./evidently-demo')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1e89667f-f84e-492a-a886-61104bc5ce49", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_iris\n", + "import pandas as pd\n", + "from mlrun.feature_store.api import norm_column_name\n", + "\n", + "iris = load_iris()\n", + "columns = [norm_column_name(col) for col in iris.feature_names]\n", + "current_df = pd.DataFrame(iris.data, columns=columns)\n", + "current_df[\"sepal_length_cm\"] += 0.3 # simulate drift" + ] + }, + { + "cell_type": "markdown", + "id": "af6e56af-c99d-481e-a32e-f7e5eac4ae3a", + "metadata": {}, + "source": [ + "## Get the module from the hub and edit its defaults" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "35a4bb6b-d15e-4bfd-8d04-2fa188cb36cc", + "metadata": {}, + "outputs": [], + "source": [ + "hub_mod = mlrun.get_hub_module(\"hub://evidently_iris\", download_files=True)\n", + "src_file_path = hub_mod.get_module_file_path()" + ] + }, + { + "cell_type": "markdown", + "id": "ba0c043b-7356-44da-b6d2-84eb02718482", + "metadata": {}, + "source": [ + "We need to modify the class defaults to include the Evidently workspace path and project ID parameters. This can be done in one of two ways: either by editing the downloaded source file directly and then evaluating with the standard class, or - as we’ll do now - by adding an inheriting class to the same file and evaluating using that new class.\n", + "\n", + "(Note: this is only needed when running the app using `evaluate()`.
When setting it as a real-time function we can simply pass the parameters)." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4e9253a9-58bd-4732-8eb1-80a7d15b2e7a", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import uuid\n", + "\n", + "ws = Path(\"./evidently_workspace\")\n", + "ws.mkdir(parents=True, exist_ok=True) # will create if missing\n", + "evidently_project_id = str(uuid.uuid4())\n", + "\n", + "wrapper_code = f\"\"\"\n", + "class EvidentlyIrisMonitoringAppWithWorkspaceSet(EvidentlyIrisMonitoringApp):\n", + " def __init__(self) -> None:\n", + " super().__init__(evidently_workspace_path=\"{ws}\", evidently_project_id=\"{evidently_project_id}\")\n", + " \"\"\"\n", + "\n", + "with open(src_file_path, \"a\") as f:\n", + " f.write(wrapper_code)" + ] + }, + { + "cell_type": "markdown", + "id": "5776541f-2d6f-4c10-9246-75fe14e1bbea", + "metadata": {}, + "source": [ + "Now we can actually import it as a module, using the `module()` method" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3742576d-6da2-423d-8c1c-2861712a698f", + "metadata": {}, + "outputs": [], + "source": [ + "app_module = hub_mod.module()\n", + "evidently_app = app_module.EvidentlyIrisMonitoringAppWithWorkspaceSet" + ] + }, + { + "cell_type": "markdown", + "id": "57a81ea8-f203-4152-9492-a0f7b916d02b", + "metadata": {}, + "source": [ + "## Run the app\n", + "We are ready to call `evaluate()` (notice that the run is linked to the current (active) project that we created at the beginning of the notebook)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d8103577-8523-4b64-bd67-e93bbde8dd06", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2025-11-17 09:14:43,241 [info] Changing function name - adding `\"-batch\"` suffix: {\"func_name\":\"evidentlyirismonitoringappwithworkspaceset-batch\"}\n", + "> 2025-11-17 09:14:43,580 [info] Storing function:
{\"db\":\"http://mlrun-api:8080\",\"name\":\"evidentlyirismonitoringappwithworkspaceset-batch--handler\",\"uid\":\"9ecf72a1bd82498c92d5897809b6a438\"}\n", + "> 2025-11-17 09:14:43,856 [info] downloading v3io:///projects/evidently-demo/artifacts/evidentlyirismonitoringappwithworkspaceset-batch_sample_data.parquet to local temp file\n", + "> 2025-11-17 09:14:43,890 [info] Running evidently app\n", + "> 2025-11-17 09:14:46,214 [info] Logged evidently object\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartendstatekindnamelabelsinputsparametersresultsartifact_uris
evidently-demo0Nov 17 09:14:43NaTcompletedrunevidentlyirismonitoringappwithworkspaceset-batch--handler
v3io_user=iguazio
kind=local
owner=iguazio
host=jupyter-97c64f97b-8qtcv
sample_data
write_output=False
existing_data_handling=fail_on_overlap
stream_profile=None
return={result_name: 'data_drift_test', result_value: 0.5, result_kind: 0, result_status: 1, result_extra_data: '{}'}
evidently_report=store://artifacts/evidently-demo/evidentlyirismonitoringappwithworkspaceset-batch--handler_evidently_report#0@9ecf72a1bd82498c92d5897809b6a438^2f82c069b396f23b4daae81540ffa386b44f165c
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2025-11-17 09:14:46,354 [info] Run execution finished: {\"name\":\"evidentlyirismonitoringappwithworkspaceset-batch--handler\",\"status\":\"completed\"}\n" + ] + } + ], + "source": [ + "# Evaluate directly on the sample data\n", + "run_result = evidently_app.evaluate(\n", + " func_path=hub_mod.get_module_file_path(),\n", + " sample_data=current_df,\n", + " run_local=True)" + ] + }, + { + "cell_type": "markdown", + "id": "2c6843cd-70d4-4e1a-8aa2-52b6ef5b0ec9", + "metadata": {}, + "source": [ + "## Examine the results\n", + "Notice that the 0.5 value in the demo run result is not derived from Evidently’s drift metrics, but is a constant placeholder added for demonstration only.\n", + "\n", + "Let's take a look at the artifact the app generated for us:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7f1680f5-0ee7-4a82-a351-f8348bf398cc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "artifact_key = f\"{run_result.metadata.name}_evidently_report\"\n", + "artifact = project.get_artifact(artifact_key)\n", + "artifact.to_dataitem().show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base-py311", + "language": "python", + "name": "conda-env-mlrun-base-py311-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/modules/src/evidently/evidently_iris.py b/modules/src/evidently_iris/evidently_iris.py similarity index 100% rename from modules/src/evidently/evidently_iris.py rename to modules/src/evidently_iris/evidently_iris.py diff --git a/modules/src/evidently/item.yaml b/modules/src/evidently_iris/item.yaml similarity index 96% rename from modules/src/evidently/item.yaml rename to modules/src/evidently_iris/item.yaml index c6a2abc2c..262b7e1b7 100644 --- a/modules/src/evidently/item.yaml +++ b/modules/src/evidently_iris/item.yaml @@ -16,6 +16,6 @@ spec: kind: monitoring_application requirements: - scikit-learn~=1.5.2 - - evidently~=0.7.6 + - evidently~=0.7.5 - pandas version: 1.0.0 \ No newline at end of file diff --git a/modules/src/evidently/requirements.txt b/modules/src/evidently_iris/requirements.txt similarity index 60% rename from modules/src/evidently/requirements.txt rename to modules/src/evidently_iris/requirements.txt index bd4abb36f..6bd12d901 100644 --- a/modules/src/evidently/requirements.txt +++ b/modules/src/evidently_iris/requirements.txt @@ -1,3 +1,3 @@ scikit-learn~=1.5.2 -evidently~=0.7.6 +evidently~=0.7.5 pandas \ No newline at end of file diff --git 
a/modules/src/evidently/test_evidently_iris.py b/modules/src/evidently_iris/test_evidently_iris.py similarity index 100% rename from modules/src/evidently/test_evidently_iris.py rename to modules/src/evidently_iris/test_evidently_iris.py From 284fb2a6d7c42c9da48034c801b708505cd8439d Mon Sep 17 00:00:00 2001 From: iguazio-cicd Date: Mon, 17 Nov 2025 13:53:07 +0000 Subject: [PATCH 16/17] chore(readme): auto-update asset tables [skip ci] --- modules/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/README.md b/modules/README.md index 79da6c416..d22a576ef 100644 --- a/modules/README.md +++ b/modules/README.md @@ -7,7 +7,7 @@ | Name | Description | Kind | Categories | | --- | --- | --- | --- | | [count_events](https://github.com/mlrun/functions/tree/development/modules/src/count_events) | Count events in each time window | monitoring_application | model-serving | -| [evidently](https://github.com/mlrun/functions/tree/development/modules/src/evidently) | Demonstrates Evidently integration in MLRun for data quality and drift monitoring using the Iris dataset | monitoring_application | model-serving, structured-ML | +| [evidently_iris](https://github.com/mlrun/functions/tree/development/modules/src/evidently_iris) | Demonstrates Evidently integration in MLRun for data quality and drift monitoring using the Iris dataset | monitoring_application | model-serving, structured-ML | | [histogram_data_drift](https://github.com/mlrun/functions/tree/development/modules/src/histogram_data_drift) | Model-monitoring application for detecting and visualizing data drift | monitoring_application | model-serving, structured-ML | | [openai_proxy_app](https://github.com/mlrun/functions/tree/development/modules/src/openai_proxy_app) | OpenAI application runtime based on fastapi | generic | genai | From cdbcc2c5df14aa9b00b4635af3eb74dd83e1f8ba Mon Sep 17 00:00:00 2001 From: iguazio-cicd Date: Tue, 18 Nov 2025 08:20:04 +0000 Subject: [PATCH 17/17] 
chore(readme): auto-update asset tables [skip ci] --- modules/README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/modules/README.md b/modules/README.md index ed2ade6d1..c3ed1c597 100644 --- a/modules/README.md +++ b/modules/README.md @@ -10,3 +10,15 @@ | [evidently_iris](https://github.com/mlrun/functions/tree/development/modules/src/evidently_iris) | Demonstrates Evidently integration in MLRun for data quality and drift monitoring using the Iris dataset | monitoring_application | model-serving, structured-ML | | [histogram_data_drift](https://github.com/mlrun/functions/tree/development/modules/src/histogram_data_drift) | Model-monitoring application for detecting and visualizing data drift | monitoring_application | model-serving, structured-ML | | [openai_proxy_app](https://github.com/mlrun/functions/tree/development/modules/src/openai_proxy_app) | OpenAI application runtime based on fastapi | generic | genai | + + +## Catalog + + +| Name | Description | Kind | Categories | +| --- | --- | --- | --- | +| [count_events](https://github.com/mlrun/functions/tree/development/modules/src/count_events) | Count events in each time window | monitoring_application | model-serving | +| [evidently_iris](https://github.com/mlrun/functions/tree/development/modules/src/evidently_iris) | Demonstrates Evidently integration in MLRun for data quality and drift monitoring using the Iris dataset | monitoring_application | model-serving, structured-ML | +| [histogram_data_drift](https://github.com/mlrun/functions/tree/development/modules/src/histogram_data_drift) | Model-monitoring application for detecting and visualizing data drift | monitoring_application | model-serving, structured-ML | +| [openai_proxy_app](https://github.com/mlrun/functions/tree/development/modules/src/openai_proxy_app) | OpenAI application runtime based on fastapi | generic | genai | +