In [None]:
# https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/cloud-evaluation

In [1]:
import os
import json
from azure.ai.projects import AIProjectClient
from azure.ai.projects.models import (
    CodeInterpreterTool,
    BingGroundingTool,
    FunctionTool
)
from azure.identity import DefaultAzureCredential
from typing import Any
from pathlib import Path

from azure.ai.ml import MLClient
import mlflow


# AI Project Client and ML Client 

AI Project Client will be used to retrieve the list of the evaluations in a project, can be used to submit a new evaluation, and configure evaluators. 

ML Client, the underlying Azure ML Workspace, will be used to retrieve job run detail that correspond to the evaluation job, then MLFlow is used to retrieve metrics and other details. 

In [2]:
project_client = AIProjectClient.from_connection_string(
    conn_str=os.environ["AZURE_AI_PROJECT_CONN_STR"],
    credential=DefaultAzureCredential()
)

In [3]:
project_client.scope

{'subscription_id': 'e9b9c71e-d2ba-4086-a5ed-2923e15962d9',
 'resource_group_name': 'rg-mutaza-5102_ai',
 'project_name': 'ai-inference-eastus2'}

# Retrieve evluations from AI Project

In [3]:
[(
    ev.id,
    ev.display_name,
    ", ".join([evva for evva in ev.evaluators]),
    ev.status,
    type(ev)
    )
 for ev in project_client.evaluations.list()]

[('46a30d94-5184-43b6-a281-1fd75c0f088a',
  'evaluation_sample_eval',
  'relevance, coherence, similarity, fluency, f1_score, bleu, gleu, rouge, meteor, self_harm, hate_unfairness, violence, indirect_attack, protected_material, sexual',
  'Completed',
  azure.ai.projects.models._models.Evaluation)]

In [4]:
eval_job_id = "46a30d94-5184-43b6-a281-1fd75c0f088a"
eval = project_client.evaluations.get(id=eval_job_id)
print(eval.id)

46a30d94-5184-43b6-a281-1fd75c0f088a


In [5]:
eval.as_dict()

{'id': '46a30d94-5184-43b6-a281-1fd75c0f088a',
 'data': {'type': 'Dataset',
  'id': 'azureml://locations/eastus2/workspaces/fd3837d7-cb17-42c2-8f2d-6e6163678714/data/eval-data-2025-03-23_090658_UTC/versions/1'},
 'target': None,
 'description': '',
 'displayName': 'evaluation_sample_eval',
 'systemData': {'createdBy': 'Mutaz Abu Ghazaleh (AI GBB)',
  'modifiedBy': 'Mutaz Abu Ghazaleh (AI GBB)'},
 'status': 'Completed',
 'tags': {'tag1': 'v1'},
 'properties': {'evaluationType': '',
  'runType': 'eval_run',
  '_azureml.evaluation_run': 'evaluation.service',
  '_azureml.evaluate_artifacts': '[{"path": "instance_results.jsonl", "type": "table"}]',
  '_azureml.ComputeTargetType': 'amlctrain',
  '_azureml.ClusterName': None,
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json',
  'StartTimeUtc': '2025-03-23 09:11:17',
  'EndTimeUtc': '2025-03-23 09:14:21',
  'OutputAssetUri': 'azureml://locations/eastus2/workspaces/fd3837d7-cb17-42c

In [6]:
eval.properties.keys()

dict_keys(['evaluationType', 'runType', '_azureml.evaluation_run', '_azureml.evaluate_artifacts', '_azureml.ComputeTargetType', '_azureml.ClusterName', 'ProcessInfoFile', 'ProcessStatusFile', 'StartTimeUtc', 'EndTimeUtc', 'OutputAssetUri', 'AiStudioEvaluationUri'])

In [7]:
eval.properties["_azureml.evaluation_run"]

'evaluation.service'

In [8]:
eval.items()

dict_items([('id', '46a30d94-5184-43b6-a281-1fd75c0f088a'), ('data', {'type': 'Dataset', 'id': 'azureml://locations/eastus2/workspaces/fd3837d7-cb17-42c2-8f2d-6e6163678714/data/eval-data-2025-03-23_090658_UTC/versions/1'}), ('target', None), ('description', ''), ('displayName', 'evaluation_sample_eval'), ('systemData', {'createdBy': 'Mutaz Abu Ghazaleh (AI GBB)', 'modifiedBy': 'Mutaz Abu Ghazaleh (AI GBB)'}), ('status', 'Completed'), ('tags', {'tag1': 'v1'}), ('properties', {'evaluationType': '', 'runType': 'eval_run', '_azureml.evaluation_run': 'evaluation.service', '_azureml.evaluate_artifacts': '[{"path": "instance_results.jsonl", "type": "table"}]', '_azureml.ComputeTargetType': 'amlctrain', '_azureml.ClusterName': None, 'ProcessInfoFile': 'azureml-logs/process_info.json', 'ProcessStatusFile': 'azureml-logs/process_status.json', 'StartTimeUtc': '2025-03-23 09:11:17', 'EndTimeUtc': '2025-03-23 09:14:21', 'OutputAssetUri': 'azureml://locations/eastus2/workspaces/fd3837d7-cb17-42c2-8f

# Evaluation Metrics

Connect to the ML workspace to read the run details of the evaluation including metrics 

In [9]:

ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=project_client.scope["subscription_id"],
    resource_group_name=project_client.scope["resource_group_name"],
    workspace_name=project_client.scope["project_name"]
)
# Set the MLflow tracking URI to the workspace's MLflow tracking URI
mlflow.set_tracking_uri(ml_client.workspaces.get(ml_client.workspace_name).mlflow_tracking_uri)

In [10]:
print(json.dumps(project_client.scope, indent=2))

print(f"ML Workspace: {ml_client.workspace_name}")

print(f"ML Workspace Tracking URI: {ml_client.workspaces.get(ml_client.workspace_name).mlflow_tracking_uri}")

print(f"MLFlow tracking uri {mlflow.get_tracking_uri()}")


{
  "subscription_id": "e9b9c71e-d2ba-4086-a5ed-2923e15962d9",
  "resource_group_name": "rg-mutaza-5102_ai",
  "project_name": "ai-inference-eastus2"
}
ML Workspace: ai-inference-eastus2
ML Workspace Tracking URI: azureml://eastus2.api.azureml.ms/mlflow/v2.0/subscriptions/e9b9c71e-d2ba-4086-a5ed-2923e15962d9/resourceGroups/rg-mutaza-5102_ai/providers/Microsoft.MachineLearningServices/workspaces/ai-inference-eastus2
MLFlow tracking uri azureml://eastus2.api.azureml.ms/mlflow/v2.0/subscriptions/e9b9c71e-d2ba-4086-a5ed-2923e15962d9/resourceGroups/rg-mutaza-5102_ai/providers/Microsoft.MachineLearningServices/workspaces/ai-inference-eastus2


## ML Workspace jobs

using the ml_client.jobs and mlflow client to retrieve the job details and metrics. 

```python

In [11]:
for j in ml_client.jobs.list():
    print(f"Job: {j.name} - {j.status}")
    print(j)

Job: 46a30d94-5184-43b6-a281-1fd75c0f088a - Completed


Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


type: command
inputs:
  eval_data:
    mode: ro_mount
    type: uri_file
    path: azureml://locations/eastus2/workspaces/fd3837d7-cb17-42c2-8f2d-6e6163678714/data/eval-data-2025-03-23_090658_UTC/versions/1
  relevance_evaluator:
    mode: ro_mount
    type: custom_model
    path: azureml://registries/azureml/models/Relevance-Evaluator/versions/5
  coherence_evaluator:
    mode: ro_mount
    type: custom_model
    path: azureml://registries/azureml/models/Coherence-Evaluator/versions/5
  similarity_evaluator:
    mode: ro_mount
    type: custom_model
    path: azureml://registries/azureml/models/Similarity-Evaluator/versions/3
  fluency_evaluator:
    mode: ro_mount
    type: custom_model
    path: azureml://registries/azureml/models/Fluency-Evaluator/versions/5
  f1_score_evaluator:
    mode: ro_mount
    type: custom_model
    path: azureml://registries/azureml/models/F1Score-Evaluator/versions/3
  bleu_evaluator:
    mode: ro_mount
    type: custom_model
    path: azureml://registri

In [12]:
ml_eval_job = ml_client.jobs.get(eval.id)
print(type(ml_eval_job))
print(json.dumps(ml_eval_job.properties, indent=2))
ml_eval_job

<class 'azure.ai.ml.entities._builders.command.Command'>
{
  "evaluationType": "",
  "runType": "eval_run",
  "_azureml.evaluation_run": "evaluation.service",
  "_azureml.evaluate_artifacts": "[{\"path\": \"instance_results.jsonl\", \"type\": \"table\"}]",
  "_azureml.ComputeTargetType": "amlctrain",
  "_azureml.ClusterName": null,
  "ProcessInfoFile": "azureml-logs/process_info.json",
  "ProcessStatusFile": "azureml-logs/process_status.json",
  "StartTimeUtc": "2025-03-23 09:11:17",
  "EndTimeUtc": "2025-03-23 09:14:21"
}


Experiment,Name,Type,Status,Details Page
Default,46a30d94-5184-43b6-a281-1fd75c0f088a,command,Completed,Link to Azure Machine Learning studio


In [13]:
ml_eval_job["to_pandas"]["getdoc"]
type(ml_eval_job)

ml_eval_job.base_path

ml_eval_job.tags
ml_eval_job.outputs['eval_output'].path

In [14]:
mlflow.search_runs()


  from google.protobuf import service as _service


Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.rouge.rouge_recall,metrics.protected_material.protected_material_defect_rate,metrics.hate_unfairness.hate_unfairness_score,metrics.gleu.gleu_score,...,metrics.fluency.gpt_fluency,metrics.meteor.meteor_score,metrics.fluency.fluency,metrics.bleu.bleu_score,metrics.similarity.gpt_similarity,params.myparam,tags.mlflow.user,tags.mlflow.runName,tags.mlflow.rootRunId,tags.tag1
0,46a30d94-5184-43b6-a281-1fd75c0f088a,50b7cf29-4a8a-4cba-b6c7-8c60d76d739a,FINISHED,,2025-03-23 09:11:17.815000+00:00,2025-03-23 09:14:21.306000+00:00,0.698529,0.0,0.0,0.113872,...,3.0,0.529203,3.0,0.042665,4.0,,Mutaz Abu Ghazaleh (AI GBB),evaluation_sample_eval,46a30d94-5184-43b6-a281-1fd75c0f088a,v1
1,ebfd253d-3c78-4f5e-b6d0-7a63b743be72,50b7cf29-4a8a-4cba-b6c7-8c60d76d739a,FINISHED,,2025-03-23 11:41:47.471000+00:00,2025-03-23 11:43:02.191000+00:00,,,,,...,,,,,,v1,Mutaz Abu Ghazaleh (AI GBB),gentle_vase_krmh0j29,ebfd253d-3c78-4f5e-b6d0-7a63b743be72,


In [15]:
mlflow_eval_run = mlflow.get_run(eval.id)

In [16]:
print(f"mlflow run id: {mlflow_eval_run.info.run_id}")
print(f"mlflow run tags: {ml_eval_job.tags}")

mlflow_eval_run.to_dictionary()

mlflow run id: 46a30d94-5184-43b6-a281-1fd75c0f088a
mlflow run tags: {'tag1': 'v1'}


{'info': {'artifact_uri': 'azureml://eastus2.api.azureml.ms/mlflow/v2.0/subscriptions/e9b9c71e-d2ba-4086-a5ed-2923e15962d9/resourceGroups/rg-mutaza-5102_ai/providers/Microsoft.MachineLearningServices/workspaces/ai-inference-eastus2/experiments/50b7cf29-4a8a-4cba-b6c7-8c60d76d739a/runs/46a30d94-5184-43b6-a281-1fd75c0f088a/artifacts',
  'end_time': 1742721261306,
  'experiment_id': '50b7cf29-4a8a-4cba-b6c7-8c60d76d739a',
  'lifecycle_stage': 'active',
  'run_id': '46a30d94-5184-43b6-a281-1fd75c0f088a',
  'run_name': 'evaluation_sample_eval',
  'run_uuid': '46a30d94-5184-43b6-a281-1fd75c0f088a',
  'start_time': 1742721077815,
  'status': 'FINISHED',
  'user_id': 'a69194f3-f66e-4be9-9459-0f9add2a7439'},
 'data': {'metrics': {'relevance.relevance': 4.0,
   'relevance.gpt_relevance': 4.0,
   'similarity.gpt_similarity': 4.0,
   'coherence.coherence': 4.0,
   'f1_score.f1_score': 0.44944341375,
   'gleu.gleu_score': 0.1138716356,
   'fluency.fluency': 3.0,
   'bleu.bleu_score': 0.04266535875,

In [17]:
mlflow_eval_run.data.metrics

{'relevance.relevance': 4.0,
 'relevance.gpt_relevance': 4.0,
 'similarity.gpt_similarity': 4.0,
 'coherence.coherence': 4.0,
 'f1_score.f1_score': 0.44944341375,
 'gleu.gleu_score': 0.1138716356,
 'fluency.fluency': 3.0,
 'bleu.bleu_score': 0.04266535875,
 'coherence.gpt_coherence': 4.0,
 'fluency.gpt_fluency': 3.0,
 'rouge.rouge_precision': 0.37147335420000005,
 'rouge.rouge_f1_score': 0.48402839400000003,
 'rouge.rouge_recall': 0.69852941175,
 'meteor.meteor_score': 0.5292030105000001,
 'self_harm.self_harm_score': 0.0,
 'hate_unfairness.hate_unfairness_score': 0.0,
 'violence.violence_score': 0.0,
 'indirect_attack.xpia_manipulated_content': 0.0,
 'indirect_attack.xpia_intrusion': 0.0,
 'indirect_attack.xpia_information_gathering': 0.0,
 'indirect_attack.xpia_defect_rate': 0.0,
 'sexual.sexual_score': 0.0,
 'protected_material.protected_material_defect_rate': 0.0}

## Submit an evaluation
