In [21]:
from azure.ai.ml import MLClient, command, Input, Output
from azure.ai.ml.entities import Environment
from azure.ai.ml.constants import AssetTypes
from azure.identity import DefaultAzureCredential
import os
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables (python-dotenv) so that later cells can read
# SUBSCRIPTION_ID, RESOURCE_GROUP, AZUREML_WORKSPACE_NAME and the AZURE_* settings
# through os.getenv / os.environ.get.
# The call is the last expression in the cell, so its return value is displayed.
load_dotenv()

True

In [22]:
# Initialize ML Client
# The workspace coordinates are read from the environment. os.getenv returns None
# for an unset variable, so check up front and report every missing name at once
# rather than constructing the client with None values.
workspace_env_vars = ("SUBSCRIPTION_ID", "RESOURCE_GROUP", "AZUREML_WORKSPACE_NAME")
missing_env_vars = [var for var in workspace_env_vars if not os.getenv(var)]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

credential = DefaultAzureCredential()
ml_client = MLClient(
    credential,
    subscription_id=os.environ["SUBSCRIPTION_ID"],
    resource_group_name=os.environ["RESOURCE_GROUP"],
    workspace_name=os.environ["AZUREML_WORKSPACE_NAME"],
)

print(f"Connected to workspace: {ml_client.workspace_name}")

Connected to workspace: strategicai-mlw-uks-dev-01


In [23]:
# Resolve the project root. The notebook normally runs from <repo>/notebooks, in
# which case the root is one level up; when the kernel is started from the repo
# root itself, use the working directory as-is. This is the same rule the
# directory-verification cell applies, so both cells agree on the result.
launch_dir = Path.cwd()
project_root = launch_dir.parent if launch_dir.name == "notebooks" else launch_dir
project_root

PosixPath('/mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-juan/code/Users/j.huertas/repos/azure_ml_pipelines_pa')

In [14]:
from azure.ai.ml.entities import Data, UserIdentityConfiguration, ManagedIdentityConfiguration
from azureml.core.authentication import ServicePrincipalAuthentication
# Identity configuration populated from AZURE_CLIENT_ID.
# NOTE(review): the client ID is passed as `principal_id`; confirm that this is
# the identifier intended here (client/application ID vs. object/principal ID).
service_principal_client_id = os.getenv("AZURE_CLIENT_ID")
identity = ManagedIdentityConfiguration(principal_id=service_principal_client_id)

# azureml.core service-principal authentication built from the AZURE_* variables.
# Any variable that is unset is passed through as None.
svc_pr_password = os.getenv("AZURE_CLIENT_SECRET")
svc_pr = ServicePrincipalAuthentication(
    tenant_id=os.getenv("AZURE_TENANT_ID"),
    service_principal_id=service_principal_client_id,
    service_principal_password=svc_pr_password,
    _enable_caching=False,
)

In [11]:
# Display the ManagedIdentityConfiguration object bound to `identity`.
identity

{'type': 'managed_identity', 'principal_id': 'e77fcc8e-5551-47b6-a600-1d5633c81e31'}

In [24]:
# Re-create the default Azure credential.
# Note: this only rebinds the name `credential`; no new MLClient is constructed
# here, so `ml_client` keeps the credential object it was originally given.
credential = DefaultAzureCredential()


In [25]:
# Verify local directory structure
# When the kernel is started inside notebooks/, the project root is one level up;
# otherwise the working directory itself is treated as the root.
current_dir = Path.cwd()
launched_from_notebooks = current_dir.name == "notebooks"
project_root = current_dir.parent if launched_from_notebooks else current_dir
print(
    "Running from notebooks folder, using parent as project root"
    if launched_from_notebooks
    else "Running from project root directly"
)

print(f"Current working directory: {current_dir}")
print(f"Project root: {project_root}")

# Directories expected under the project root
pa_dir = project_root / "PA"
config_dir = pa_dir / "config"
pipeline_dir = project_root / "azureml_pipeline"
required_dirs = {
    "PA": pa_dir,
    "PA/config": config_dir,
    "azureml_pipeline": pipeline_dir,
}

# Report each directory; a missing one is only reported, execution continues
for label, location in required_dirs.items():
    print(
        f"✓ {label} directory found at {location}"
        if location.exists()
        else f"✗ ERROR: {label} directory not found at {location}"
    )

# Files expected inside those directories
required_files = {
    "config_ecomm.yaml": config_dir / "config_ecomm.yaml",
    "config_vet.yaml": config_dir / "config_vet.yaml",
    "azureml_step1_data_prep.py": pipeline_dir / "azureml_step1_data_prep.py",
}

# Report each file; a missing one produces a warning line only
for label, location in required_files.items():
    print(
        f"✓ {label} found at {location}"
        if location.exists()
        else f"✗ WARNING: {label} not found at {location}"
    )

# Define inputs
# Build the datastore URI from the workspace that ml_client targets instead of
# hard-coding the subscription, resource group and workspace, so the input
# location cannot drift from the environment-driven client configuration.
input_data_uri = (
    f"azureml://subscriptions/{ml_client.subscription_id}"
    f"/resourcegroups/{ml_client.resource_group_name}"
    f"/workspaces/{ml_client.workspace_name}"
    "/datastores/landing_pa/paths/landing/azureml/"
)

# Choose which config to use
config_type = "ecomm"  # or "vet"

# The job command passes PA/config/config_<config_type>.yaml relative to the
# uploaded project root; stop here if that file is absent rather than uploading
# the code and letting the remote run fail.
selected_config_path = project_root / "PA" / "config" / f"config_{config_type}.yaml"
if not selected_config_path.is_file():
    raise FileNotFoundError(
        f"Config for config_type={config_type!r} not found at {selected_config_path}"
    )

# Create the command job

# Command line executed on the compute target. The doubled braces survive the
# f-string as ${{inputs.<name>}} / ${{outputs.<name>}} placeholders; only
# config_type is interpolated locally.
step1_command = f"""python azureml_pipeline/azureml_step1_data_prep.py \
        --config PA/config/config_{config_type}.yaml \
        --input_uri ${{{{inputs.input_data}}}} \
        --incremental ${{{{inputs.incremental}}}} \
        --output_registration ${{{{outputs.registration_output}}}} \
        --output_scan ${{{{outputs.scan_output}}}} \
        --output_session ${{{{outputs.session_output}}}} \
        --output_metadata ${{{{outputs.metadata_output}}}}
    """

# NOTE(review): `incremental` is a Python bool that is substituted into the
# command line as text; confirm the script's --incremental parsing handles it.
step1_inputs = {
    "input_data": Input(
        type=AssetTypes.URI_FOLDER,
        path=input_data_uri,
        description="Input data from blob storage",
    ),
    "incremental": False,
}

# Every output is a folder; one Output object per named output
step1_outputs = {
    output_name: Output(type=AssetTypes.URI_FOLDER)
    for output_name in (
        "registration_output",
        "scan_output",
        "session_output",
        "metadata_output",
    )
}

event_label = config_type.upper()
step1_tags = {
    "step": "data_preparation",
    "event_type": config_type,
    "project": "personal_agendas",
}

# IMPORTANT: `code` is the project root so that all local files are uploaded.
# NOTE(review): that may include local-only files (for example a .env) unless
# they are excluded by an ignore file — confirm before sharing the workspace.
job = command(
    code=str(project_root),
    command=step1_command,
    inputs=step1_inputs,
    outputs=step1_outputs,
    environment="personal-agendas-env:1",  # environment name:version
    compute="cpu-cluster",
    display_name=f"PA Step 1: Data Preparation ({event_label})",
    description=f"Process registration, scan, and session data for {event_label}",
    tags=step1_tags,
)


Running from notebooks folder, using parent as project root
Current working directory: /mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-juan/code/Users/j.huertas/repos/azure_ml_pipelines_pa/notebooks
Project root: /mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-juan/code/Users/j.huertas/repos/azure_ml_pipelines_pa
✓ PA directory found at /mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-juan/code/Users/j.huertas/repos/azure_ml_pipelines_pa/PA
✓ PA/config directory found at /mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-juan/code/Users/j.huertas/repos/azure_ml_pipelines_pa/PA/config
✓ azureml_pipeline directory found at /mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-juan/code/Users/j.huertas/repos/azure_ml_pipelines_pa/azureml_pipeline
✓ config_ecomm.yaml found at /mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-juan/code/Users/j.huertas/repos/azure_ml_pipelines_pa/PA/config/config_ecomm.yaml
✓ config_vet.yaml found at /mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-j

In [26]:
# Print job configuration
# Assemble the whole summary first and emit it with a single print call.
banner = "=" * 60
config_summary = [
    "",  # yields the blank line before the opening banner
    banner,
    "Job Configuration:",
    f"  Config type: {config_type}",
    f"  Input data: {input_data_uri}",
    f"  Code directory: {project_root} (will be uploaded)",
    "  Command: python azureml_pipeline/azureml_step1_data_prep.py ...",
    banner,
]
print("\n".join(config_summary))

# Submit the job
print("\nSubmitting job...")
submitted_job = ml_client.jobs.create_or_update(job, experiment_name="personal_agendas_step1_test")

# Summarise what the service returned for the new job
submission_report = [
    "\n✓ Job submitted successfully!",
    f"  Job Name: {submitted_job.name}",
    f"  Job Type: {submitted_job.type}",
    f"  Job Status: {submitted_job.status}",
    f"  Monitor at: {submitted_job.studio_url}",
    "\nAll local code and config files have been automatically uploaded to Azure ML.",
]
print("\n".join(submission_report))

# Optional: Monitor job status
# Prints a copy-pasteable snippet (wrapped in a fenced python block) that
# fetches the job by name and shows its status.
monitor_snippet = (
    "```python",
    f"job = ml_client.jobs.get('{submitted_job.name}')",
    "print(f'Status: {job.status}')",
    "```",
)
print("\nTo monitor job status, you can use:")
print(*monitor_snippet, sep="\n")


Job Configuration:
  Config type: ecomm
  Input data: azureml://subscriptions/b8d6d487-0bd2-4773-b318-12ab763ed178/resourcegroups/strategicai-rg-uks-dev-01/workspaces/strategicai-mlw-uks-dev-01/datastores/azureml_landing/paths/landing/azureml/
  Code directory: /mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-juan/code/Users/j.huertas/repos/azure_ml_pipelines_pa (will be uploaded)
  Command: python azureml_pipeline/azureml_step1_data_prep.py ...

Submitting job...

✓ Job submitted successfully!
  Job Name: sharp_cassava_dnj6r2xy1s
  Job Type: command
  Job Status: Starting
  Monitor at: https://ml.azure.com/runs/sharp_cassava_dnj6r2xy1s?wsid=/subscriptions/b8d6d487-0bd2-4773-b318-12ab763ed178/resourcegroups/strategicai-rg-uks-dev-01/workspaces/strategicai-mlw-uks-dev-01&tid=3540e7dc-31b3-4057-9e31-43e9fe938179

All local code and config files have been automatically uploaded to Azure ML.

To monitor job status, you can use:
```python
job = ml_client.jobs.get('sharp_cassava_dnj6r2xy1

[32mUploading azure_ml_pipelines_pa (0.85 MBs): 100%|██████████| 852739/852739 [00:00<00:00, 2353626.80it/s]
[39m

pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
