# Personal Agendas Complete Pipeline for BVA

Runs the end-to-end Personal Agendas Azure ML pipeline for the BVA veterinary agenda using the LVA-style configuration. It handles:

- Step 1: Data preparation (registration, scan, session normalization)
- Step 2: Neo4j preparation (visitors, relationships, streams)
- Step 3: Session embeddings (text embeddings persisted back to Neo4j)
- Step 4: Recommendations (control groups + exports)

It is wired to `PA/config/config_vet_bva.yaml`, reads the staged blobs in `landing/azureml/data/bva`, and pins the Neo4j work to the PROD graph unless you override `PA_NEO4J_ENVIRONMENT`.

In [None]:
# Import required libraries
import os
import sys
import json
from pathlib import Path
from datetime import datetime

import yaml

# Azure ML imports
from azure.ai.ml import MLClient, command, Input, Output
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Environment, AmlCompute
from azure.ai.ml.dsl import pipeline
from azure.identity import DefaultAzureCredential
from azure.identity import ClientSecretCredential
from azure.core.exceptions import ResourceNotFoundError

from dotenv import load_dotenv

# Azure ML environment version used when PA_ENV_VERSION is not supplied externally.
DEFAULT_PA_ENV_VERSION = 12

# Load environment variables
load_dotenv()
# Guarantee PA_ENV_VERSION has a value without clobbering one that is already set.
os.environ.setdefault("PA_ENV_VERSION", str(DEFAULT_PA_ENV_VERSION))

print("="*60)
print("AZURE ML PIPELINE COMPLETE")
print("="*60)

# Configuration: service-principal credentials and workspace coordinates.
client_id = os.getenv("AZURE_CLIENT_ID")
client_secret = os.getenv("AZURE_CLIENT_SECRET")
tenant_id = os.getenv("AZURE_TENANT_ID")
subscription_id = os.getenv("SUBSCRIPTION_ID")
resource_group = os.getenv("RESOURCE_GROUP")
workspace_name = os.getenv("AZUREML_WORKSPACE_NAME")

# Compute targets (defaults can be overridden via environment variables)
default_compute_target = os.getenv("PA_DEFAULT_COMPUTE", "cpu-cluster")
recommendations_compute_target = os.getenv("PA_STEP4_COMPUTE", "cpu-cluster-3")

# sys.version is the Python interpreter version, not the Azure ML SDK version,
# so it is labelled accordingly.
print(f"Python version: {sys.version}")
print(f"Current time: {datetime.now().isoformat()}")
print(f"Target Azure ML env version: {os.getenv('PA_ENV_VERSION')}")
print(f"Default compute target: {default_compute_target}")
print(f"Recommendations compute target: {recommendations_compute_target}")

In [None]:
# Repeats the load_dotenv() call from the setup cell so this cell can be re-run
# on its own after editing .env.
# NOTE(review): with python-dotenv's default (override=False) variables already
# present in the process environment keep their current values — confirm.
load_dotenv()

In [None]:
# Route MLflow tracking to Databricks for this run.
os.environ["MLFLOW_TRACKING_URI"] = "databricks"
# Default the Neo4j target to PROD, but keep a non-empty value that was already
# supplied (shell or .env). A hard assignment here made PA_NEO4J_ENVIRONMENT
# impossible to override and left the DEV/TEST credential branches unreachable.
if not os.environ.get("PA_NEO4J_ENVIRONMENT"):
    os.environ["PA_NEO4J_ENVIRONMENT"] = "PROD"

In [None]:
# Read the Databricks / MLflow connection settings from the environment in one pass.
(
    databricks_token,
    databricks_host,
    mlflow_tracking_uri,
    mlflow_registry_uri,
    mlflow_experiment_id,
) = map(
    os.getenv,
    (
        "DATABRICKS_TOKEN",
        "DATABRICKS_HOST",
        "MLFLOW_TRACKING_URI",
        "MLFLOW_REGISTRY_URI",
        "MLFLOW_EXPERIMENT_ID",
    ),
)

# All five settings must be non-empty; the token is checked but never printed.
if all((databricks_token, databricks_host, mlflow_tracking_uri, mlflow_registry_uri, mlflow_experiment_id)):
    print(f"Databricks / MLFLOW credentials configured {databricks_host}, {mlflow_tracking_uri}, {mlflow_registry_uri}, {mlflow_experiment_id}")
else:
    print("WARNING: Databricks / MLFLOW credentials not found")
    print("Please set DATABRICKS_TOKEN, DATABRICKS_HOST, MLFLOW_TRACKING_URI, MLFLOW_REGISTRY_URI and MLFLOW_EXPERIMENT_ID")

In [None]:
# IMPORTANT: Configure Neo4j credentials
# DEV / TEST / PROD read the suffixed variables (e.g. NEO4J_URI_PROD); any other
# value of PA_NEO4J_ENVIRONMENT, including unset, reads the unsuffixed ones.
ENVIRONMENT = os.getenv("PA_NEO4J_ENVIRONMENT")
neo4j_username = os.getenv("NEO4J_USERNAME")
_neo4j_suffix = f"_{ENVIRONMENT}" if ENVIRONMENT in ("DEV", "TEST", "PROD") else ""
neo4j_uri = os.getenv(f"NEO4J_URI{_neo4j_suffix}")
neo4j_password = os.getenv(f"NEO4J_PASSWORD{_neo4j_suffix}")

# Publish the resolved values under the generic names for downstream steps;
# an empty or missing value leaves the existing generic variable untouched.
for _generic_name, _resolved_value in (
    ("NEO4J_URI", neo4j_uri),
    ("NEO4J_PASSWORD", neo4j_password),
):
    if _resolved_value:
        os.environ[_generic_name] = _resolved_value

if neo4j_uri and neo4j_username and neo4j_password:
    print(f"Neo4j credentials configured for {ENVIRONMENT}")
else:
    print("WARNING: Neo4j credentials not found")
    print("Please set NEO4J_URI, NEO4J_USERNAME, and NEO4J_PASSWORD (or ENV-specific overrides).")

In [None]:
# Validate required environment variables (aligned with Key Vault secret map)
from pprint import pprint

# Order is preserved because it determines the order of the printed report.
required_env_vars = [
    # Azure identity and workspace
    "AZURE_CLIENT_ID", "AZURE_CLIENT_SECRET", "AZURE_TENANT_ID",
    "SUBSCRIPTION_ID", "RESOURCE_GROUP", "AZUREML_WORKSPACE_NAME",
    "KEYVAULT_NAME",
    # Model endpoints
    "OPENAI_API_KEY", "AZURE_API_KEY", "AZURE_ENDPOINT",
    "AZURE_DEPLOYMENT", "AZURE_API_VERSION",
    # Neo4j (generic names)
    "NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD",
    # Databricks / MLflow
    "DATABRICKS_TOKEN", "DATABRICKS_HOST",
    "MLFLOW_TRACKING_URI", "MLFLOW_REGISTRY_URI", "MLFLOW_EXPERIMENT_ID",
    # Telemetry and pipeline settings
    "APPLICATIONINSIGHTS_CONNECTION_STRING",
    "PA_ENV_VERSION", "PA_NEO4J_ENVIRONMENT",
    # Neo4j (per-environment names)
    "NEO4J_URI_DEV", "NEO4J_URI_TEST", "NEO4J_URI_PROD",
    "NEO4J_PASSWORD_DEV", "NEO4J_PASSWORD_TEST", "NEO4J_PASSWORD_PROD",
]

# A variable counts as missing when it is unset or set to an empty string.
missing_env = [name for name in required_env_vars if not os.getenv(name)]
if not missing_env:
    print("All required environment variables are present.")
else:
    print("WARNING: The following environment variables are missing. Confirm notebooks/.env or Key Vault entries:")
    pprint(missing_env)

In [None]:
# Environment variables propagated to every pipeline step
current_dir = Path.cwd()
if current_dir.name == "notebooks":
    project_root_for_env = current_dir.parent
else:
    project_root_for_env = current_dir

# PYTHONPATH covers the project root and its PA package folder.
# NOTE(review): these are paths on the submitting machine and the separator
# follows the local OS — confirm they are meaningful on the compute target.
pythonpath_entries = [
    project_root_for_env.as_posix(),
    project_root_for_env.joinpath("PA").as_posix(),
]
path_separator = {"nt": ";"}.get(os.name, ":")
pythonpath_value = path_separator.join(filter(None, pythonpath_entries))

# Key Vault URI is derived from KEYVAULT_NAME and stays None when that is unset/empty.
_keyvault_name = os.getenv("KEYVAULT_NAME")
keyvault_uri = f"https://{_keyvault_name}.vault.azure.net/" if _keyvault_name else None

pa_env_version = os.getenv("PA_ENV_VERSION", str(DEFAULT_PA_ENV_VERSION))
otel_resource_detectors = os.getenv("OTEL_EXPERIMENTAL_RESOURCE_DETECTORS", "otel")
otel_service_name = os.getenv("OTEL_SERVICE_NAME", "pa-azureml-step")

# Every key that is propagated, in propagation order. Keys whose value is not a
# plain pass-through of the same-named variable are overwritten right below;
# dict.update() keeps their original position.
_propagated_names = (
    "AZURE_CLIENT_ID",
    "AZURE_CLIENT_SECRET",
    "AZURE_TENANT_ID",
    "SUBSCRIPTION_ID",
    "RESOURCE_GROUP",
    "AZUREML_WORKSPACE_NAME",
    "STORAGE_ACCOUNT_NAME",
    "KEYVAULT_NAME",
    "KEYVAULT_URI",
    "OPENAI_API_KEY",
    "AZURE_API_KEY",
    "AZURE_ENDPOINT",
    "AZURE_DEPLOYMENT",
    "AZURE_API_VERSION",
    "AZURE_OPENAI_API_KEY",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_DEPLOYMENT_BATCH",
    "AZURE_API_VERSION_BATCH",
    "NEO4J_URI",
    "NEO4J_USERNAME",
    "NEO4J_PASSWORD",
    "NEO4J_URI_DEV",
    "NEO4J_PASSWORD_DEV",
    "NEO4J_URI_TEST",
    "NEO4J_PASSWORD_TEST",
    "NEO4J_URI_PROD",
    "NEO4J_PASSWORD_PROD",
    "DATABRICKS_TOKEN",
    "DATABRICKS_HOST",
    "DATABRICKS_HOSTS",
    "MLFLOW_TRACKING_URI",
    "MLFLOW_REGISTRY_URI",
    "MLFLOW_EXPERIMENT_ID",
    "APPLICATIONINSIGHTS_CONNECTION_STRING",
    "AZURE_APPINSIGHTS_CONNECTION_STRING",
    "APPINSIGHTS_CONNECTION_STRING",
    "APPLICATIONINSIGHTS_AUTHENTICATION_MODE",
    "OTEL_EXPERIMENTAL_RESOURCE_DETECTORS",
    "OTEL_SERVICE_NAME",
    "PA_ENV_VERSION",
    "PYTHONPATH",
    "PA_NEO4J_ENVIRONMENT",
)
env_vars_template = {name: os.getenv(name) for name in _propagated_names}
env_vars_template.update(
    {
        "STORAGE_ACCOUNT_NAME": "strategicaistuksdev02",
        "KEYVAULT_URI": keyvault_uri,
        "OTEL_EXPERIMENTAL_RESOURCE_DETECTORS": otel_resource_detectors,
        "OTEL_SERVICE_NAME": otel_service_name,
        "PA_ENV_VERSION": pa_env_version,
        "PYTHONPATH": pythonpath_value,
    }
)

# Unset and empty values are dropped rather than forwarded to the steps.
environment_variables = {
    name: setting
    for name, setting in env_vars_template.items()
    if setting is not None and setting != ""
}

print(f"Propagating {len(environment_variables)} environment variables to every step")

In [None]:
# Create ML Client
# Authenticate as the service principal whose identifiers were read from the
# environment in the setup cell.
credential = ClientSecretCredential(
    tenant_id=tenant_id, client_id=client_id, client_secret=client_secret
)

# Workspace coordinates are collected first, then handed to the client.
_workspace_scope = {
    "subscription_id": subscription_id,
    "resource_group_name": resource_group,
    "workspace_name": workspace_name,
}
ml_client = MLClient(credential, **_workspace_scope)

# NOTE(review): nothing in this cell issues a request to the workspace, so this
# message may not prove connectivity — confirm.
print(f"Connected to workspace: {workspace_name}")

In [None]:
# Verify project structure
# The notebook may be launched from <root>/notebooks or from <root> itself.
current_dir = Path.cwd()
_in_notebooks_dir = current_dir.name == "notebooks"
project_root = current_dir.parent if _in_notebooks_dir else current_dir
print("Running from notebooks folder" if _in_notebooks_dir else "Running from project root")

print(f"Project root: {project_root}")

# The PA package folder is only reported on; it is never created here.
pa_dir = project_root / "PA"
if pa_dir.exists():
    print("✓ PA directory found")
else:
    print(f"ERROR: PA directory not found at {pa_dir}")

# The pipeline script folder is created when absent.
pipeline_dir = project_root / "azureml_pipeline"
if pipeline_dir.exists():
    print("✓ azureml_pipeline directory found")
else:
    print("Creating azureml_pipeline directory")
    pipeline_dir.mkdir(exist_ok=True)

# One entry script per pipeline step, keyed "Step 1" .. "Step 4".
step_scripts = {
    f"Step {step_number}": pipeline_dir / script_filename
    for step_number, script_filename in enumerate(
        (
            "azureml_step1_data_prep.py",
            "azureml_step2_neo4j_prep.py",
            "azureml_step3_session_embedding.py",
            "azureml_step4_recommendations.py",
        ),
        start=1,
    )
}

for step_name, script_path in step_scripts.items():
    if not script_path.exists():
        print(f"WARNING: {step_name} script not found at {script_path}")
    else:
        print(f"✓ {step_name} script found")

config_path = pa_dir / "config" / "config_vet_bva.yaml"
if not config_path.exists():
    print(f"ERROR: config_vet_bva.yaml not found at {config_path}")
else:
    print("✓ config_vet_bva.yaml found")

In [None]:
# Environment setup (auto-version logic)
# Locate the folder holding conda.yaml: ./env first, then <root>/notebooks/env,
# then <root>/env. If none exists, ./env is kept and the check below warns.
dependencies_dir = Path("./env")
if not dependencies_dir.exists():
    notebooks_env = project_root / "notebooks" / "env"
    repo_env = project_root / "env"
    if notebooks_env.exists():
        dependencies_dir = notebooks_env
    elif repo_env.exists():
        dependencies_dir = repo_env
custom_env_name = "pa-env"
requested_version_raw = os.getenv("PA_ENV_VERSION", str(DEFAULT_PA_ENV_VERSION))
try:
    requested_env_version = int(requested_version_raw)
except ValueError:
    print(f"WARNING: Invalid PA_ENV_VERSION '{requested_version_raw}'. Auto-incrementing from workspace.")
    requested_env_version = None

# Index the numeric versions of the environment already registered in the workspace.
existing_envs = {}
existing_versions = []
try:
    for env in ml_client.environments.list(name=custom_env_name):
        version_str = str(env.version)
        if version_str.isdigit():
            version_int = int(version_str)
            existing_versions.append(version_int)
            existing_envs[version_int] = env
except Exception as exc:
    # Best effort: if listing fails we continue as if no version existed.
    print(f"Warning: unable to enumerate existing environments ({exc})")

latest_version = max(existing_versions) if existing_versions else None

conda_file = dependencies_dir / "conda.yaml"
if not conda_file.exists():
    print(f"WARNING: conda.yaml not found at {conda_file}")
    print("Using default conda configuration")

if requested_env_version is not None and requested_env_version in existing_envs:
    # Covers "requested == latest" too: latest_version is always a key of
    # existing_envs, so the former separate branch for it could never run.
    job_env = existing_envs[requested_env_version]
    print(f"Using existing environment: {job_env.name}:{job_env.version}")
else:
    # No usable requested version: create one, never going below the latest.
    version_to_create = (
        requested_env_version
        if requested_env_version is not None
        else (latest_version + 1 if latest_version is not None else 1)
    )

    if latest_version is not None and version_to_create <= latest_version:
        version_to_create = latest_version + 1
        print(f"Requested version already exists or is older. Incrementing to {version_to_create}.")

    print(f"Creating environment version {version_to_create} for {custom_env_name}")
    environment_kwargs = {
        "name": custom_env_name,
        "version": str(version_to_create),
        "description": "Environment for Personal Agendas pipeline with Neo4j and embeddings",
        # Without a conda.yaml the environment is just the base image.
        "conda_file": str(conda_file) if conda_file.exists() else None,
        "image": "mcr.microsoft.com/azureml/openmpi5.0-ubuntu24.04:20250601.v1",
    }

    job_env = Environment(**environment_kwargs)
    job_env = ml_client.environments.create_or_update(job_env)
    print(f"Created environment: {job_env.name}:{job_env.version}")

In [None]:
# Define INCREMENTAL versions of the components (optional)
# Runs the Step 3 script with --incremental fixed to "true", so there is no
# "incremental" input on this component.
# NOTE(review): the command text was unreadable in this file and has been rebuilt
# to mirror session_embedding_component's arguments — confirm against
# azureml_step3_session_embedding.py.
session_embedding_component_incremental = command(
    name="session_embedding_incremental",
    display_name="Step 3: Session Embedding (Incremental)",
    description="Generate embeddings only for new sessions without embeddings",
    inputs={
        "config_type": Input(type="string", default="vet_bva"),
        # Not referenced by the command; presumably only orders this step
        # after the Neo4j step — verify.
        "neo4j_ready": Input(type="uri_folder", optional=True),
        "input_registration": Input(type="uri_folder"),
        "input_scan": Input(type="uri_folder"),
        "input_session": Input(type="uri_folder"),
        "neo4j_environment": Input(type="string", default=""),
    },
    outputs={
        "metadata_output": Output(type="uri_folder")
    },
    code=str(project_root),
    command="""python azureml_pipeline/azureml_step3_session_embedding.py \\
        --config PA/config/config_${{inputs.config_type}}.yaml \\
        --input_registration ${{inputs.input_registration}} \\
        --input_scan ${{inputs.input_scan}} \\
        --input_session ${{inputs.input_session}} \\
        --neo4j_environment ${{inputs.neo4j_environment}} \\
        --incremental true \\
        --output_metadata ${{outputs.metadata_output}}
    """,
    environment=f"{job_env.name}:{job_env.version}",
    environment_variables=environment_variables,
    # Honour PA_DEFAULT_COMPUTE instead of a hard-coded cluster name.
    compute=default_compute_target,
    is_deterministic=False
)


In [None]:
# Step 1: Data Preparation Component
# Runs azureml_step1_data_prep.py on the raw input folder and exposes one output
# folder each for registration, scan and session data, plus a metadata folder.
data_preparation_component = command(
    name="data_preparation",
    display_name="Step 1: Data Preparation",
    description="Process registration, scan, and session data",
    inputs={
        "input_uri": Input(type="uri_folder"),
        # Selects PA/config/config_<config_type>.yaml inside the code snapshot.
        "config_type": Input(type="string", default="vet_bva"),
        "incremental": Input(type="string", default="false"),
    },
    outputs={
        "registration_output": Output(type="uri_folder"),
        "scan_output": Output(type="uri_folder"),
        "session_output": Output(type="uri_folder"),
        "metadata_output": Output(type="uri_folder")
    },
    code=str(project_root),
    command="""python azureml_pipeline/azureml_step1_data_prep.py \\
        --config PA/config/config_${{inputs.config_type}}.yaml \\
        --input_uri ${{inputs.input_uri}} \\
        --incremental ${{inputs.incremental}} \\
        --output_registration ${{outputs.registration_output}} \\
        --output_scan ${{outputs.scan_output}} \\
        --output_session ${{outputs.session_output}} \\
        --output_metadata ${{outputs.metadata_output}}
    """,
    environment=f"{job_env.name}:{job_env.version}",
    environment_variables=environment_variables,
    # Honour PA_DEFAULT_COMPUTE instead of a hard-coded cluster name.
    compute=default_compute_target,
    is_deterministic=False
)

print("✓ Step 1 component defined")

In [None]:
# Step 2: Neo4j Preparation Component
# Runs azureml_step2_neo4j_prep.py on the three prepared data folders and
# writes a metadata folder.
neo4j_preparation_component = command(
    name="neo4j_preparation",
    display_name="Step 2: Neo4J Preparation",
    description="Upload data to Neo4j database",
    inputs={
        # Selects PA/config/config_<config_type>.yaml inside the code snapshot.
        "config_type": Input(type="string", default="vet_bva"),
        "input_registration": Input(type="uri_folder"),
        "input_scan": Input(type="uri_folder"),
        "input_session": Input(type="uri_folder"),
        "incremental": Input(type="string", default="false"),
        "neo4j_environment": Input(type="string", default=""),
    },
    outputs={
        "metadata_output": Output(type="uri_folder")
    },
    code=str(project_root),
    command="""python azureml_pipeline/azureml_step2_neo4j_prep.py \\
        --config PA/config/config_${{inputs.config_type}}.yaml \\
        --input_registration ${{inputs.input_registration}} \\
        --input_scan ${{inputs.input_scan}} \\
        --input_session ${{inputs.input_session}} \\
        --incremental ${{inputs.incremental}} \\
        --neo4j_environment ${{inputs.neo4j_environment}} \\
        --output_metadata ${{outputs.metadata_output}}
    """,
    environment=f"{job_env.name}:{job_env.version}",
    environment_variables=environment_variables,
    # Honour PA_DEFAULT_COMPUTE instead of a hard-coded cluster name.
    compute=default_compute_target,
    is_deterministic=False
)

print("✓ Step 2 component defined")

In [None]:
# Step 3: Session Embedding Component
# Runs azureml_step3_session_embedding.py on the three prepared data folders and
# writes a metadata folder.
session_embedding_component = command(
    name="session_embedding",
    display_name="Step 3: Session Embedding",
    description="Generate and store text embeddings for session nodes in Neo4j",
    inputs={
        # Selects PA/config/config_<config_type>.yaml inside the code snapshot.
        "config_type": Input(type="string", default="vet_bva"),
        # Not referenced by the command; presumably only orders this step
        # after the Neo4j step — verify.
        "neo4j_ready": Input(type="uri_folder", optional=True),
        "input_registration": Input(type="uri_folder"),
        "input_scan": Input(type="uri_folder"),
        "input_session": Input(type="uri_folder"),
        "incremental": Input(type="string", default="false"),
        "neo4j_environment": Input(type="string", default=""),
    },
    outputs={
        "metadata_output": Output(type="uri_folder")
    },
    code=str(project_root),
    command="""python azureml_pipeline/azureml_step3_session_embedding.py \
        --config PA/config/config_${{inputs.config_type}}.yaml \
        --input_registration ${{inputs.input_registration}} \
        --input_scan ${{inputs.input_scan}} \
        --input_session ${{inputs.input_session}} \
        --incremental ${{inputs.incremental}} \
        --neo4j_environment ${{inputs.neo4j_environment}} \
        --output_metadata ${{outputs.metadata_output}}
    """,
    environment=f"{job_env.name}:{job_env.version}",
    environment_variables=environment_variables,
    # Honour PA_DEFAULT_COMPUTE instead of a hard-coded cluster name.
    compute=default_compute_target,
    is_deterministic=False
)

print("✓ Step 3 component defined")

In [None]:
# Step 4: Recommendations Component
# Inputs and outputs are declared up front so the command() call reads as a
# short summary of the step.
_step4_inputs = {
    "config_type": Input(type="string", default="vet_bva"),
    # Not referenced by the command; presumably only orders this step after
    # the embedding step — verify.
    "embeddings_ready": Input(type="uri_folder", optional=True),
    "input_registration": Input(type="uri_folder"),
    "input_scan": Input(type="uri_folder"),
    "input_session": Input(type="uri_folder"),
    "incremental": Input(type="string", default="false"),
    # Optional: wrapped in $[[...]] in the command below.
    "input_data_uri": Input(type="string", optional=True),
    "neo4j_environment": Input(type="string", default=""),
}
_step4_outputs = {"metadata_output": Output(type="uri_folder")}

recommendations_component = command(
    name="recommendations",
    display_name="Step 4: Recommendations",
    description="Generate session recommendations for visitors",
    inputs=_step4_inputs,
    outputs=_step4_outputs,
    code=str(project_root),
    command="""python azureml_pipeline/azureml_step4_recommendations.py \
        --config PA/config/config_${{inputs.config_type}}.yaml \
        --input_registration ${{inputs.input_registration}} \
        --input_scan ${{inputs.input_scan}} \
        --input_session ${{inputs.input_session}} \
        --incremental ${{inputs.incremental}} \
        $[[--input_data_uri ${{inputs.input_data_uri}}]] \
        --neo4j_environment ${{inputs.neo4j_environment}} \
        --output_metadata ${{outputs.metadata_output}}
    """,
    environment=f"{job_env.name}:{job_env.version}",
    environment_variables=environment_variables,
    # This step has its own compute target (PA_STEP4_COMPUTE).
    compute=recommendations_compute_target,
    is_deterministic=False,
)

print("✓ Step 4 component defined")

In [None]:
# Define the complete pipeline with explicit step chaining (Step1 -> Step2 -> Step3 -> Step4)
#
# Parameters:
#   pipeline_input_data        - folder input handed to Step 1 as input_uri
#   pipeline_config_type       - config suffix forwarded to every step
#   pipeline_incremental       - "true"/"false" string forwarded to every step
#   pipeline_input_uri         - string forwarded to Step 4's optional input_data_uri
#   pipeline_neo4j_environment - forwarded to Steps 2-4 as neo4j_environment
# Returns a dict exposing Step 1's three data folders plus each step's metadata
# folder as pipeline outputs.
#
# The pipeline-level default compute follows PA_DEFAULT_COMPUTE
# (default_compute_target) instead of a hard-coded cluster name.
@pipeline(
    compute=default_compute_target,
    description="Complete Personal Agendas pipeline with 4 ordered steps",
)
def personal_agendas_complete_pipeline(
    pipeline_input_data: Input,
    pipeline_config_type: str = "vet_bva",
    pipeline_incremental: str = "false",
    pipeline_input_uri: str = "",
    pipeline_neo4j_environment: str = "",
):
    """Complete Personal Agendas pipeline."""

    step1 = data_preparation_component(
        input_uri=pipeline_input_data,
        config_type=pipeline_config_type,
        incremental=pipeline_incremental
    )
    step1.name = "step1_data_preparation"

    step2 = neo4j_preparation_component(
        config_type=pipeline_config_type,
        input_registration=step1.outputs.registration_output,
        input_scan=step1.outputs.scan_output,
        input_session=step1.outputs.session_output,
        incremental=pipeline_incremental,
        neo4j_environment=pipeline_neo4j_environment,
    )
    step2.name = "step2_neo4j_preparation"

    # Steps 3 and 4 read Step 1's data folders; consuming the previous step's
    # metadata output is what chains them after Step 2 / Step 3 respectively.
    step3 = session_embedding_component(
        config_type=pipeline_config_type,
        neo4j_ready=step2.outputs.metadata_output,
        input_registration=step1.outputs.registration_output,
        input_scan=step1.outputs.scan_output,
        input_session=step1.outputs.session_output,
        incremental=pipeline_incremental,
        neo4j_environment=pipeline_neo4j_environment,
    )
    step3.name = "step3_session_embedding"

    step4 = recommendations_component(
        config_type=pipeline_config_type,
        embeddings_ready=step3.outputs.metadata_output,
        input_registration=step1.outputs.registration_output,
        input_scan=step1.outputs.scan_output,
        input_session=step1.outputs.session_output,
        incremental=pipeline_incremental,
        input_data_uri=pipeline_input_uri,
        neo4j_environment=pipeline_neo4j_environment,
    )
    step4.name = "step4_recommendations"

    return {
        "registration_data": step1.outputs.registration_output,
        "scan_data": step1.outputs.scan_output,
        "session_data": step1.outputs.session_output,
        "step1_metadata": step1.outputs.metadata_output,
        "neo4j_metadata": step2.outputs.metadata_output,
        "embedding_metadata": step3.outputs.metadata_output,
        "recommendations_metadata": step4.outputs.metadata_output
    }

print("✓ Complete pipeline defined")

In [None]:
# Configure input data URI
# Assembled from the workspace coordinates plus the fixed datastore path of the
# staged BVA data.
_workspace_segment = (
    f"subscriptions/{subscription_id}"
    f"/resourcegroups/{resource_group}"
    f"/workspaces/{workspace_name}"
)
_datastore_segment = "datastores/landing_pa/paths/landing/azureml/data/bva"
input_data_uri = f"azureml://{_workspace_segment}/{_datastore_segment}"

print(f"Input data URI: {input_data_uri}")

In [None]:
from typing import Dict, Any

In [None]:
# Create pipeline instance
pipeline_incremental_value = "false"
pipeline_neo4j_environment_value = os.getenv("PA_NEO4J_ENVIRONMENT", "")
pipeline_job = personal_agendas_complete_pipeline(
    pipeline_input_data=Input(
        type=AssetTypes.URI_FOLDER,
        path=input_data_uri
    ),
    pipeline_config_type="vet_bva",
    pipeline_incremental=pipeline_incremental_value,
    pipeline_input_uri=input_data_uri,
    pipeline_neo4j_environment=pipeline_neo4j_environment_value,
)

# Peek at the YAML config to find a fallback Neo4j environment for the tags.
# The path is anchored at project_root: a cwd-relative "PA/..." path does not
# exist when the notebook runs from the notebooks folder, which silently
# skipped this preview.
config_preview: Dict[str, Any] = {}
config_path = project_root / "PA" / "config" / "config_vet_bva.yaml"
if config_path.exists():
    with open(config_path, "r", encoding="utf-8") as config_file:
        config_preview = yaml.safe_load(config_file) or {}
# Precedence: explicit environment variable, then the config's
# neo4j.environment, then the literal "unspecified".
resolved_neo4j_env = (
    pipeline_neo4j_environment_value
    or (config_preview.get('neo4j', {}) or {}).get('environment', '')
    or "unspecified"
)

pipeline_job.display_name = "Personal Agendas Complete Pipeline - BVA"
pipeline_job.tags = {
    "project": "personal_agendas",
    "event_type": "bva",
    "environment": resolved_neo4j_env,
    "includes_neo4j": "true",
    "includes_embeddings": "true",
    "includes_recommendations": "true",
    "incremental": pipeline_incremental_value,
    "step_count": "4",
    "complete_pipeline": "true",
    "neo4j_database": resolved_neo4j_env,
}
pipeline_job.experiment_name = "personal_agendas_complete_experiment_bva"

print(f"✓ Pipeline instance created NEO4J ENV: {pipeline_neo4j_environment_value}")

In [None]:
# Submit the pipeline
print("\nSubmitting pipeline...")
print("="*60)

try:
    pipeline_run = ml_client.jobs.create_or_update(pipeline_job)

    print("✓ Pipeline submitted successfully!")
    print("\nPipeline Details:")
    print(f"  Name: {pipeline_run.name}")
    print(f"  Display Name: {pipeline_run.display_name}")
    print(f"  Status: {pipeline_run.status}")
    print(f"  Experiment: {pipeline_run.experiment_name}")
    # \U0001F517 is the link emoji; the escape had lost its "\U" prefix and
    # printed the raw code point digits instead.
    print("\n\U0001F517 View pipeline in Azure ML Studio:")
    print(f"  {pipeline_run.studio_url}")

except Exception as e:
    # Report the failure, then re-raise so the cell still fails visibly.
    print(f"❌ Error submitting pipeline: {e}")
    raise

In [None]:
# Optional: Monitor pipeline progress
import time

print("\nMonitoring pipeline progress...")
print("(Press Ctrl+C to stop monitoring)\n")

try:
    # Poll the job every 30 seconds until it reaches one of the end states below.
    while True:
        job = ml_client.jobs.get(pipeline_run.name)

        print(f"[{datetime.now().strftime('%H:%M:%S')}] Status: {job.status}", end="")

        if job.status in ["Completed", "Failed", "Canceled"]:
            print(f"\n\nPipeline {job.status}!")
            if job.status == "Failed":
                print("Check the logs in Azure ML Studio for error details.")
            break

        # The end= literal had been split by a raw line break (a syntax error).
        # A carriage return lets the next poll overwrite this status line.
        # NOTE(review): "\r" is inferred from the broken literal — use "\n" if
        # one line per poll is preferred.
        print(" (waiting...)", end="\r", flush=True)
        time.sleep(30)

except KeyboardInterrupt:
    # Stopping the loop only stops local polling.
    print("\n\nMonitoring stopped. Pipeline continues running in Azure ML.")
    print(f"Check status at: {pipeline_run.studio_url}")

## Pipeline Summary (BVA)

This run targets the BVA/LVA veterinary flavour (personal agendas mode) using `config_vet_bva.yaml`, the `landing/azureml/data/bva` blobs, and the Neo4j PROD graph.

1. **Data Preparation** – cleans + normalizes registration, demographic, scan, and session exports
2. **Neo4j Preparation** – refreshes visitors, sessions, job/specialization streams, and relationships
3. **Session Embedding** – produces text embeddings for sessions
4. **Recommendations** – writes enriched CSV/JSON exports with vet-specific filtering and control-group logic

Next steps:
- Track the Azure ML job until completion
- Inspect outputs under the workspace datastore + `recommendations` path
- Validate Neo4j PROD has the expected visitor/session graph before sharing files