In [None]:
import os
import json
from datetime import datetime
import pandas as pd
import numpy as np

from azure.ai.ml import MLClient, Input, Output, command
from azure.ai.ml.entities import (
    Job, 
    Environment, 
    BuildContext,
    Model,
    Data
)
from azure.ai.ml.constants import AssetTypes
from azure.identity import DefaultAzureCredential
from azure.ai.ml.sweep import Choice, Uniform, BanditPolicy

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
SUBSCRIPTION_ID = os.getenv("AZURE_SUBSCRIPTION_ID", "3fc7fd13-533e-40a7-8e3d-f1fbf4204436")
RESOURCE_GROUP = os.getenv("AZURE_RESOURCE_GROUP", "edu-demo")
WORKSPACE_NAME = os.getenv("AZURE_WORKSPACE_NAME", "edu-demo")

try:
    credential = DefaultAzureCredential()
    ml_client = MLClient(
        credential=credential,
        subscription_id=SUBSCRIPTION_ID,
        resource_group_name=RESOURCE_GROUP,
        workspace_name=WORKSPACE_NAME,
    )
    
    print("Azure ML Client initialized successfully!")
    print(f"Workspace: {ml_client.workspace_name}")
    
except Exception as e:
    print(f"Error initializing Azure ML Client: {e}")

In [None]:
try:
    # Create environment
    env_name = "training-env"
    
    training_env = Environment(
        name=env_name,
        description="Environment for model training",
        conda_file="./src/conda.yml",
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest"
    )
    
    # Create the environment
    training_env = ml_client.environments.create_or_update(training_env)
    print(f"Environment '{env_name}' created successfully!")
    
except Exception as e:
    print(f"Error creating environment: {e}")
    print("Will use default environment for training jobs.")

In [None]:
try:
    # Get training data asset
    train_data = ml_client.data.get("synthetic_train_data", version="1")
    test_data = ml_client.data.get("synthetic_test_data", version="1")
    
    print("Successfully retrieved data assets:")
    print(f"- Training data: {train_data.name}:{train_data.version}")
    print(f"- Test data: {test_data.name}:{test_data.version}")
    
    use_azure_data = True
    
except Exception as e:
    print(f"Could not retrieve data assets: {e}")
    print("Will use local data files for training.")
    use_azure_data = False

In [15]:
try:
    # Define the command job
    if use_azure_data:
        train_input = Input(type=AssetTypes.URI_FILE, path=train_data.id)
        test_input = Input(type=AssetTypes.URI_FILE, path=test_data.id)
    else:
        train_input = Input(type=AssetTypes.URI_FILE, path="./data/train_data.csv")
        test_input = Input(type=AssetTypes.URI_FILE, path="./data/test_data.csv")
    
    job = command(
        experiment_name="synthetic-classification-training",
        display_name="random-forest-training",
        description="Training a Random Forest model on synthetic data",
        
        # Training script and environment
        code="./src",
        command="python train.py --train_data ${{inputs.train_data}} --test_data ${{inputs.test_data}} --model_type random_forest --n_estimators 100 --max_depth 10",
        environment=f"{env_name}@latest",
        
        # Inputs
        inputs={
            "train_data": train_input,
            "test_data": test_input,
        },
        
        # Compute
        compute="cpu-cluster",  # Use your compute cluster name or "local" for local compute
    )
    
    # Submit the job
    print("Submitting training job...")
    submitted_job = ml_client.jobs.create_or_update(job)
    
    print(f"Job submitted successfully!")
    print(f"Job name: {submitted_job.name}")
    print(f"Job status: {submitted_job.status}")
    print(f"Studio URL: {submitted_job.studio_url}")
    
    # Wait for job completion and register model
    print("\nWaiting for training job to complete...")
    
    def wait_and_register_model(job_name, timeout_minutes=30):
        """Wait for job completion and automatically register the model"""
        import time
        start_time = time.time()
        timeout_seconds = timeout_minutes * 60
        
        while time.time() - start_time < timeout_seconds:
            try:
                job = ml_client.jobs.get(job_name)
                print(f"Job status: {job.status}")
                
                if job.status == "Completed":
                    print("✅ Training job completed successfully!")
                    
                    # Automatically register the model
                    try:
                        model_name = "auto-registered-classification-model"
                        
                        # Create model from job outputs
                        model = Model(
                            path=f"azureml://jobs/{job_name}/outputs/artifacts/paths/outputs/",
                            type=AssetTypes.CUSTOM_MODEL,
                            name=model_name,
                            description=f"Auto-registered model from job {job_name}",
                            tags={
                                "training_job": job_name,
                                "framework": "scikit-learn",
                                "algorithm": "random_forest",
                                "auto_registered": "true",
                                "training_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                            }
                        )
                        
                        # Register the model
                        registered_model = ml_client.models.create_or_update(model)
                        
                        print(f"\n🎉 Model automatically registered!")
                        print(f"Model name: {registered_model.name}")
                        print(f"Model version: {registered_model.version}")
                        print(f"Model ID: {registered_model.id}")
                        
                        # Try to add performance metrics
                        try:
                            # Download job outputs to get metrics
                            ml_client.jobs.download(name=job_name, download_path="./auto_job_outputs")
                            
                            import glob
                            metrics_files = glob.glob("./auto_job_outputs/**/metrics.json", recursive=True)
                            
                            if metrics_files:
                                with open(metrics_files[0], 'r') as f:
                                    metrics = json.load(f)
                                
                                # Update model with performance metrics
                                model_with_metrics = Model(
                                    path=f"azureml://jobs/{job_name}/outputs/artifacts/paths/outputs/",
                                    type=AssetTypes.CUSTOM_MODEL,
                                    name=model_name,
                                    version=registered_model.version,
                                    description=f"Auto-registered model from job {job_name} - Accuracy: {metrics.get('test_accuracy', 0):.4f}",
                                    tags={
                                        "training_job": job_name,
                                        "framework": "scikit-learn",
                                        "algorithm": "random_forest",
                                        "auto_registered": "true",
                                        "training_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                                        "test_accuracy": str(round(metrics.get('test_accuracy', 0), 4)),
                                        "test_f1": str(round(metrics.get('test_f1', 0), 4)),
                                        "test_auc": str(round(metrics.get('test_auc', 0), 4)),
                                        "cv_accuracy": str(round(metrics.get('cv_accuracy_mean', 0), 4))
                                    }
                                )
                                
                                updated_model = ml_client.models.create_or_update(model_with_metrics)
                                print(f"✅ Model updated with performance metrics!")
                                print(f"Test Accuracy: {metrics.get('test_accuracy', 0):.4f}")
                                print(f"Test F1 Score: {metrics.get('test_f1', 0):.4f}")
                                print(f"Test AUC: {metrics.get('test_auc', 0):.4f}")
                                
                        except Exception as e:
                            print(f"Could not add performance metrics: {e}")
                            print("Model registered without detailed metrics")
                        
                        return registered_model
                        
                    except Exception as e:
                        print(f"Error registering model: {e}")
                        return None
                
                elif job.status in ["Failed", "Canceled"]:
                    print(f"❌ Training job {job.status.lower()}")
                    return None
                
                else:
                    # Job still running, wait and check again
                    time.sleep(30)
                    
            except Exception as e:
                print(f"Error checking job status: {e}")
                break
        
        print(f"⏰ Timeout reached ({timeout_minutes} minutes)")
        return None
    
    # Start monitoring and auto-registration
    registered_model = wait_and_register_model(submitted_job.name)
    
    if registered_model:
        print(f"\n🚀 Training and registration completed successfully!")
        print(f"Your model '{registered_model.name}' is ready for deployment!")
    else:
        print(f"\n⚠️ Training job may still be running or failed.")
        print(f"Check job status at: {submitted_job.studio_url}")
        print(f"You can manually register the model once training completes.")
    
except Exception as e:
    print(f"Error submitting job: {e}")
    print("\nPossible issues:")
    print("1. Compute cluster 'cpu-cluster' doesn't exist")
    print("2. Environment is still building")
    print("3. Data assets are not accessible")
    print("\nTry creating a compute cluster or use 'local' compute.")

Submitting training job...
Job submitted successfully!
Job name: silver_knee_7bhcyl7021
Job status: Starting
Studio URL: https://ml.azure.com/runs/silver_knee_7bhcyl7021?wsid=/subscriptions/3fc7fd13-533e-40a7-8e3d-f1fbf4204436/resourcegroups/edu-demo/workspaces/edu-demo&tid=249668d2-ea5d-48ad-8200-693934e8cd2b

Waiting for training job to complete...
Job status: Starting
Job status: Running
Job status: Running
Job status: Completed
✅ Training job completed successfully!

🎉 Model automatically registered!
Model name: auto-registered-classification-model
Model version: 1
Model ID: /subscriptions/3fc7fd13-533e-40a7-8e3d-f1fbf4204436/resourceGroups/edu-demo/providers/Microsoft.MachineLearningServices/workspaces/edu-demo/models/auto-registered-classification-model/versions/1


Downloading artifact azureml://datastores/workspaceartifactstore/ExperimentRun/dcid.silver_knee_7bhcyl7021 to auto_job_outputs\artifacts
ActivityCompleted: Activity=Model.CreateOrUpdate, HowEnded=Failure, Duration=838.35 [ms], Exception=HttpResponseError, ErrorCategory=UserError, ErrorMessage=(UserError) A model with this name and version already exists. If you are trying to create a new model, use a different name. If you are trying to update an existing model, the existing model's asset path cannot be changed. Only description, tags and properties can be updated.
Code: UserError
Message: A model with this name and version already exists. If you are trying to create a new model, use a different name. If you are trying to update an existing model, the existing model's asset path cannot be changed. Only description, tags and properties can be updated.
Target: $.properties.path
Additional Information:Type: ComponentName
Info: {
    "value": "managementfrontend"
}Type: Correlation
Info: {

Could not add performance metrics: (UserError) A model with this name and version already exists. If you are trying to create a new model, use a different name. If you are trying to update an existing model, the existing model's asset path cannot be changed. Only description, tags and properties can be updated.
Code: UserError
Message: A model with this name and version already exists. If you are trying to create a new model, use a different name. If you are trying to update an existing model, the existing model's asset path cannot be changed. Only description, tags and properties can be updated.
Target: $.properties.path
Additional Information:Type: ComponentName
Info: {
    "value": "managementfrontend"
}Type: Correlation
Info: {
    "value": {
        "operation": "c69541731a0e0925eca9c9c0c43c3f49",
        "request": "fc24ce513f184919"
    }
}Type: Environment
Info: {
    "value": "westeurope"
}Type: Location
Info: {
    "value": "westeurope"
}Type: Time
Info: {
    "value": "2025-

In [None]:
from azure.ai.ml.sweep import Choice, Uniform

try:
    # Define search space
    command_job_for_sweep = command(
        code="./src",
        command="python train.py --train_data ${{inputs.train_data}} --test_data ${{inputs.test_data}} --model_type ${{search_space.model_type}} --n_estimators ${{search_space.n_estimators}} --max_depth ${{search_space.max_depth}} --C ${{search_space.C}}",
        environment=f"{env_name}@latest",
        inputs={
            "train_data": train_input,
            "test_data": test_input,
        },
        compute="cpu-cluster",
    )
    
    # Define sweep job
    from azure.ai.ml import sweep
    
    sweep_job = command_job_for_sweep.sweep(
        compute="cpu-cluster",
        sampling_algorithm="random",
        primary_metric="test_accuracy",
        goal="Maximize",
        max_total_trials=4,
        max_concurrent_trials=2,
    )
    
    # Define search space
    sweep_job.search_space = {
        "model_type": Choice(["random_forest", "logistic_regression"]),
        "n_estimators": Choice([50, 100, 200]),
        "max_depth": Choice([5, 10, 15]),
        "C": Uniform(0.1, 10.0),
    }
    
    # Set experiment name
    sweep_job.experiment_name = "synthetic-classification-hyperparameter-tuning"
    sweep_job.display_name = "hyperparameter-sweep"
    sweep_job.description = "Hyperparameter tuning for classification models"
    
    # Submit sweep job
    print("Submitting hyperparameter tuning job...")
    sweep_job = ml_client.jobs.create_or_update(sweep_job)
    
    print(f"Sweep job submitted successfully!")
    print(f"Job name: {sweep_job.name}")
    print(f"Studio URL: {sweep_job.studio_url}")
    
except Exception as e:
    print(f"Error submitting sweep job: {e}")
    print("Hyperparameter tuning requires a compute cluster to be available.")

In [16]:
try:
    # List recent jobs
    print("Recent training jobs:")
    jobs = ml_client.jobs.list(max_results=5)
    
    for job in jobs:
        print(f"- {job.name}: {job.status} ({job.experiment_name})")
        
except Exception as e:
    print(f"Error listing jobs: {e}")

Recent training jobs:
- silver_knee_7bhcyl7021: Completed (synthetic-classification-training)
- sleepy_cumin_7cwbm44q29: Completed (synthetic-classification-hyperparameter-tuning)
- salmon_vase_ms4hs9njqb: Completed (synthetic-classification-training)
- imgbldrun_5459fe7: Completed (prepare_image)


In [17]:
try:
    print("Registered models in workspace:")
    models = ml_client.models.list()
    
    for model in models:
        print(f"- {model.name} (v{model.version}): {model.description}")
        if hasattr(model, 'tags') and model.tags:
            for key, value in model.tags.items():
                print(f"  {key}: {value}")
        print()
        
except Exception as e:
    print(f"Error listing models: {e}")

Registered models in workspace:
- azureml_affable_guava_wmhch0vkhm_output_mlflow_log_model_1848236288 (vNone): None

- azureml_affable_guava_wmhch0vkhm_output_mlflow_log_model_1192329897 (vNone): None

- iris_model (vNone): None

- azureml_placid_grape_ntjw9sknwc_output_mlflow_log_model_380776297 (vNone): None

- azureml_placid_grape_ntjw9sknwc_output_mlflow_log_model_282143572 (vNone): None

- azureml_boring_juice_tpr5g52cbs_output_mlflow_log_model_908092165 (vNone): None

- azureml_boring_juice_tpr5g52cbs_output_mlflow_log_model_1250250616 (vNone): None

- azureml_nifty_clock_byqw0241xy_output_mlflow_log_model_83605581 (vNone): None

- azureml_nifty_clock_byqw0241xy_output_mlflow_log_model_472997680 (vNone): None

- azureml_orange_crayon_4641st7vf0_output_mlflow_log_model_489039716 (vNone): None

- azureml_orange_crayon_4641st7vf0_output_mlflow_log_model_472997680 (vNone): None

- azureml_olden_airport_xmrqwhgcc3_output_mlflow_log_model_908092165 (vNone): None

- azureml_olden_airpor