# Personal Agendas Pipeline with Session Embedding - Step 3

This notebook submits the Personal Agendas pipeline with 3 steps:
1. **Data Preparation**: Registration, Scan, Session processing
2. **Neo4J Preparation**: Create nodes and relationships in Neo4j
3. **Session Embedding**: Generate and store embeddings for sessions

## Prerequisites
- Azure ML workspace connection
- Compute cluster named 'cpu-cluster'
- PA code repository with config files
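- Neo4j database connection (`NEO4J_URI`, `NEO4J_USERNAME`, and `NEO4J_PASSWORD` set in the environment or a `.env` file; the pipeline steps are also given the Key Vault name)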

In [5]:
# Import required libraries
from azure.ai.ml import MLClient, command, Input, Output
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Environment, AmlCompute
from azure.ai.ml.dsl import pipeline
from azure.identity import DefaultAzureCredential
from azure.identity import ClientSecretCredential
from pathlib import Path
import os
import time
from dotenv import load_dotenv
# Load environment variables from a local .env file (if any) into os.environ
load_dotenv()

print("="*60)
print("AZURE ML PIPELINE WITH NEO4J STEP")
print("="*60)

# Configuration: service-principal credentials and workspace coordinates.
# os.getenv returns None for anything unset, so gather the values first and
# report gaps here instead of claiming success unconditionally.
_azure_settings = {
    variable: os.getenv(variable)
    for variable in (
        "AZURE_CLIENT_ID",
        "AZURE_CLIENT_SECRET",
        "AZURE_TENANT_ID",
        "SUBSCRIPTION_ID",
        "RESOURCE_GROUP",
        "AZUREML_WORKSPACE_NAME",
    )
}
client_id = _azure_settings["AZURE_CLIENT_ID"]
client_secret = _azure_settings["AZURE_CLIENT_SECRET"]
tenant_id = _azure_settings["AZURE_TENANT_ID"]
subscription_id = _azure_settings["SUBSCRIPTION_ID"]
resource_group = _azure_settings["RESOURCE_GROUP"]
workspace_name = _azure_settings["AZUREML_WORKSPACE_NAME"]

_missing_azure_settings = [name for name, value in _azure_settings.items() if not value]
if _missing_azure_settings:
    # Only names are printed; values (including the client secret) never are.
    print(f"WARNING: missing environment variables: {', '.join(_missing_azure_settings)}")
else:
    print("Environment variables loaded")

AZURE ML PIPELINE WITH NEO4J STEP
Environment variables loaded


In [6]:
# IMPORTANT: the Neo4j connection settings are read here and reused by the
# component definitions further down.
# Option 1: pick them up from the .env file
load_dotenv()

# Option 2: set them for this session only (replace with your actual credentials)
# Uncomment and update these lines:
# os.environ["NEO4J_URI"] = "neo4j+s://your-instance.databases.neo4j.io"
# os.environ["NEO4J_USERNAME"] = "neo4j"
# os.environ["NEO4J_PASSWORD"] = "your-password"

# Each name is None when the corresponding variable is unset.
neo4j_uri, neo4j_username, neo4j_password = (
    os.getenv(variable)
    for variable in ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
)

# An empty string counts as missing, exactly like an unset variable.
if neo4j_uri and neo4j_username and neo4j_password:
    print("Neo4j credentials configured")
else:
    print("WARNING: Neo4j credentials not found")
    print("Please set NEO4J_URI, NEO4J_USERNAME, and NEO4J_PASSWORD")

Neo4j credentials configured


In [7]:
# Authenticate as the service principal, then build the workspace client
credential = ClientSecretCredential(
    client_secret=client_secret,
    client_id=client_id,
    tenant_id=tenant_id,
)

# The client is scoped to a single workspace: subscription -> resource group -> workspace.
ml_client = MLClient(
    credential=credential,
    workspace_name=workspace_name,
    resource_group_name=resource_group,
    subscription_id=subscription_id,
)

print("Connected to workspace: {}".format(workspace_name))

Connected to workspace: strategicai-mlw-uks-dev-01


In [8]:
# Verify local directory structure


def report_path(path, found_message, missing_message=None):
    """Print whether ``path`` exists and return the result.

    Args:
        path: ``pathlib.Path`` to look for.
        found_message: Text printed when the path exists.
        missing_message: Text printed when it does not; ``None`` prints nothing.

    Returns:
        ``True`` if the path exists, ``False`` otherwise.
    """
    found = path.exists()
    if found:
        print(found_message)
    elif missing_message is not None:
        print(missing_message)
    return found


# The notebook may be started from <project>/notebooks or from <project> itself.
current_dir = Path.cwd()
if current_dir.name == "notebooks":
    project_root = current_dir.parent
    print("Running from notebooks folder")
else:
    project_root = current_dir
    print("Running from project root")

print(f"Project root: {project_root}")

# Check required directories
pa_dir = project_root / "PA"
report_path(pa_dir, "✓ PA directory found", f"ERROR: PA directory not found at {pa_dir}")

pipeline_dir = project_root / "azureml_pipeline"
if not report_path(
    pipeline_dir,
    "✓ azureml_pipeline directory found",
    "Creating azureml_pipeline directory",
):
    pipeline_dir.mkdir(exist_ok=True)

# Check for step scripts
step1_script = pipeline_dir / "azureml_step1_data_prep.py"
step2_script = pipeline_dir / "azureml_step2_neo4j_prep.py"
step3_script = pipeline_dir / "azureml_step3_session_embedding.py"

for step_number, script in enumerate((step1_script, step2_script, step3_script), start=1):
    if not report_path(
        script,
        f"✓ Step {step_number} script found",
        f"WARNING: Step {step_number} script not found at {script}",
    ):
        # Same hint for every step, so a missing script always says where it belongs.
        print(f"  Please ensure {script.name} is in {pipeline_dir.name} folder")

# Check config files; a missing file is reported rather than silently skipped,
# because the pipeline command builds its --config path from these names.
config_vet = pa_dir / "config" / "config_vet.yaml"
config_ecomm = pa_dir / "config" / "config_ecomm.yaml"

for config_file in (config_ecomm, config_vet):
    report_path(
        config_file,
        f"✓ {config_file.name} found",
        f"WARNING: {config_file.name} not found at {config_file}",
    )

Running from notebooks folder
Project root: /mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-juan/code/Notebooks/repos/azure_ml_pipelines_pa
✓ PA directory found
✓ azureml_pipeline directory found
✓ Step 1 script found
✓ Step 2 script found
✓ Step 3 script found
✓ config_ecomm.yaml found
✓ config_vet.yaml found


In [9]:
# Environment setup: reuse the registered environment when it can be fetched,
# otherwise register one from environment/conda.yaml on top of the base image.
dependencies_dir = project_root / "environment"
custom_env_name = "pa-env"
custom_env_version = "5"  # version looked up in the workspace
try:
    job_env = ml_client.environments.get(name=custom_env_name, version=custom_env_version)
    print(f"Using existing environment: {custom_env_name}:{custom_env_version}")
except Exception as lookup_error:
    # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate; the error type is shown so that an auth or network
    # failure is not mistaken for "environment does not exist".
    print(
        f"Could not load {custom_env_name}:{custom_env_version} "
        f"({type(lookup_error).__name__}); creating the environment instead"
    )

    # Create environment if it doesn't exist
    conda_file = dependencies_dir / "conda.yaml"
    if not conda_file.exists():
        print(f"WARNING: conda.yaml not found at {conda_file}")
        print("Using default conda configuration")

    job_env = Environment(
        name=custom_env_name,
        description="Environment for Personal Agendas pipeline with Neo4j and embeddings",
        # Without a conda file the environment is the bare base image.
        conda_file=str(conda_file) if conda_file.exists() else None,
        image="mcr.microsoft.com/azureml/openmpi5.0-ubuntu24.04:20250601.v1"
    )
    job_env = ml_client.environments.create_or_update(job_env)
    print(f"Created environment: {custom_env_name}:{job_env.version}")

Using existing environment: pa-env:5


In [10]:
# Step 1 component (non-incremental): runs azureml_step1_data_prep.py against
# the input folder and writes four separate output folders.
data_preparation_component = command(
    name="data_preparation",
    display_name="Step 1: Data Preparation",
    description="Process registration, scan, and session data",
    compute="cpu-cluster",
    environment="{}:{}".format(job_env.name, job_env.version),
    environment_variables=dict(KEYVAULT_NAME="strategicai-kv-uks-dev"),
    # The whole project root is uploaded, so the script and PA/config paths in
    # the command below are relative to it.
    code=str(project_root),
    inputs=dict(
        input_uri=Input(type="uri_folder"),
        config_type=Input(type="string", default="ecomm"),
    ),
    outputs={
        output_name: Output(type="uri_folder")
        for output_name in (
            "registration_output",
            "scan_output",
            "session_output",
            "metadata_output",
        )
    },
    # config_type selects which PA/config/config_<type>.yaml the script loads.
    command="""python azureml_pipeline/azureml_step1_data_prep.py \
        --config PA/config/config_${{inputs.config_type}}.yaml \
        --input_uri ${{inputs.input_uri}} \
        --output_registration ${{outputs.registration_output}} \
        --output_scan ${{outputs.scan_output}} \
        --output_session ${{outputs.session_output}} \
        --output_metadata ${{outputs.metadata_output}}
    """,
    is_deterministic=False,
)

In [20]:
# Define Step 2: Neo4J Preparation Component (NON-INCREMENTAL)

# Connection settings forwarded to the remote step. Placeholder fallbacks such
# as "password" are deliberately not used: they would let the job be submitted
# with unusable credentials, so a missing value stops the notebook here instead.
# NOTE(review): values passed via environment_variables travel with the job
# definition; consider having the step read them from the Key Vault named in
# KEYVAULT_NAME instead - confirm against azureml_step2_neo4j_prep.py.
neo4j_environment = {
    "NEO4J_URI": neo4j_uri,
    "NEO4J_USERNAME": neo4j_username,
    "NEO4J_PASSWORD": neo4j_password,
}
missing_neo4j_settings = [name for name, value in neo4j_environment.items() if not value]
if missing_neo4j_settings:
    raise ValueError(
        "Neo4j settings missing: " + ", ".join(missing_neo4j_settings)
        + ". Set them in the environment or the .env file before defining the components."
    )

# Consumes the registration, scan and session folders and writes one metadata folder.
neo4j_preparation_component = command(
    name="neo4j_preparation",
    display_name="Step 2: Neo4J Preparation",
    description="Upload data to Neo4j database - create all nodes and relationships",
    inputs={
        "config_type": Input(type="string", default="ecomm"),
        "input_registration": Input(type="uri_folder"),
        "input_scan": Input(type="uri_folder"),
        "input_session": Input(type="uri_folder")
    },
    outputs={
        "metadata_output": Output(type="uri_folder")
    },
    code=str(project_root),
    command="""python azureml_pipeline/azureml_step2_neo4j_prep.py \
        --config PA/config/config_${{inputs.config_type}}.yaml \
        --input_registration ${{inputs.input_registration}} \
        --input_scan ${{inputs.input_scan}} \
        --input_session ${{inputs.input_session}} \
        --output_metadata ${{outputs.metadata_output}}
    """,
    environment=f"{job_env.name}:{job_env.version}",
    environment_variables={
        "KEYVAULT_NAME": "strategicai-kv-uks-dev",
        **neo4j_environment,
    },
    compute="cpu-cluster",
    is_deterministic=False
)

In [21]:
# Define Step 3: Session Embedding Component (NON-INCREMENTAL)

# Connection settings forwarded to the remote step. Placeholder fallbacks such
# as "password" are deliberately not used: they would let the job be submitted
# with unusable credentials, so a missing value stops the notebook here instead.
# NOTE(review): values passed via environment_variables travel with the job
# definition; consider having the step read them from the Key Vault named in
# KEYVAULT_NAME instead - confirm against azureml_step3_session_embedding.py.
neo4j_environment = {
    "NEO4J_URI": neo4j_uri,
    "NEO4J_USERNAME": neo4j_username,
    "NEO4J_PASSWORD": neo4j_password,
}
missing_neo4j_settings = [name for name, value in neo4j_environment.items() if not value]
if missing_neo4j_settings:
    raise ValueError(
        "Neo4j settings missing: " + ", ".join(missing_neo4j_settings)
        + ". Set them in the environment or the .env file before defining the components."
    )

session_embedding_component = command(
    name="session_embedding",
    display_name="Step 3: Session Embedding",
    description="Generate and store text embeddings for session nodes in Neo4j",
    inputs={
        "config_type": Input(type="string", default="ecomm"),
        # Not referenced by the command line below: binding it to an upstream
        # output only makes this step wait for that upstream step.
        "neo4j_ready": Input(type="uri_folder", optional=True),
    },
    outputs={
        "metadata_output": Output(type="uri_folder")
    },
    code=str(project_root),
    command="""python azureml_pipeline/azureml_step3_session_embedding.py \
        --config PA/config/config_${{inputs.config_type}}.yaml \
        --output_metadata ${{outputs.metadata_output}}
    """,
    environment=f"{job_env.name}:{job_env.version}",
    environment_variables={
        "KEYVAULT_NAME": "strategicai-kv-uks-dev",
        **neo4j_environment,
    },
    compute="cpu-cluster",
    is_deterministic=False
)

In [22]:
# Define INCREMENTAL versions of the components (optional)

# Connection settings forwarded to the remote step. Placeholder fallbacks such
# as "password" are deliberately not used: they would let the job be submitted
# with unusable credentials, so a missing value stops the notebook here instead.
# NOTE(review): values passed via environment_variables travel with the job
# definition; consider having the step read them from the Key Vault named in
# KEYVAULT_NAME instead - confirm against azureml_step3_session_embedding.py.
neo4j_environment = {
    "NEO4J_URI": neo4j_uri,
    "NEO4J_USERNAME": neo4j_username,
    "NEO4J_PASSWORD": neo4j_password,
}
missing_neo4j_settings = [name for name, value in neo4j_environment.items() if not value]
if missing_neo4j_settings:
    raise ValueError(
        "Neo4j settings missing: " + ", ".join(missing_neo4j_settings)
        + ". Set them in the environment or the .env file before defining the components."
    )

# Same script as the non-incremental Step 3, invoked with --incremental.
session_embedding_component_incremental = command(
    name="session_embedding_incremental",
    display_name="Step 3: Session Embedding (Incremental)",
    description="Generate embeddings only for new sessions without embeddings",
    inputs={
        "config_type": Input(type="string", default="ecomm"),
        # Not referenced by the command line below: binding it to an upstream
        # output only makes this step wait for that upstream step.
        "neo4j_ready": Input(type="uri_folder", optional=True),
    },
    outputs={
        "metadata_output": Output(type="uri_folder")
    },
    code=str(project_root),
    command="""python azureml_pipeline/azureml_step3_session_embedding.py \
        --config PA/config/config_${{inputs.config_type}}.yaml \
        --incremental \
        --output_metadata ${{outputs.metadata_output}}
    """,
    environment=f"{job_env.name}:{job_env.version}",
    environment_variables={
        "KEYVAULT_NAME": "strategicai-kv-uks-dev",
        **neo4j_environment,
    },
    compute="cpu-cluster",
    is_deterministic=False
)

In [25]:
# Full (non-incremental) pipeline: three steps chained through their outputs
@pipeline(
    description="Personal Agendas pipeline with Neo4j and session embeddings",
    compute="cpu-cluster",
)
def personal_agendas_pipeline_with_embeddings(
    pipeline_input_data: Input,
    pipeline_config_type: str = "ecomm"
):
    """Wire data preparation, Neo4j loading and session embedding into one job.

    Args:
        pipeline_input_data: Folder input handed to Step 1 as ``input_uri``.
        pipeline_config_type: Forwarded to every step as ``config_type``.

    Returns:
        Dict exposing the four Step 1 outputs plus the metadata outputs of
        Steps 2 and 3 as pipeline-level outputs.
    """
    # NOTE(review): the step1/step2/step3 variable names are kept as-is because
    # the DSL may derive node names from them - confirm before renaming.

    # Step 1 reads the raw input folder.
    step1 = data_preparation_component(
        config_type=pipeline_config_type,
        input_uri=pipeline_input_data,
    )

    # Step 2 is fed by three of Step 1's output folders.
    step2 = neo4j_preparation_component(
        input_registration=step1.outputs.registration_output,
        input_scan=step1.outputs.scan_output,
        input_session=step1.outputs.session_output,
        config_type=pipeline_config_type,
    )

    # Step 3 passes no file arguments to its script; binding Step 2's metadata
    # output to `neo4j_ready` is what creates the dependency on Step 2.
    step3 = session_embedding_component(
        neo4j_ready=step2.outputs.metadata_output,
        config_type=pipeline_config_type,
    )

    return dict(
        registration_data=step1.outputs.registration_output,
        scan_data=step1.outputs.scan_output,
        session_data=step1.outputs.session_output,
        step1_metadata=step1.outputs.metadata_output,
        neo4j_metadata=step2.outputs.metadata_output,
        embedding_metadata=step3.outputs.metadata_output,
    )

In [26]:
# Input location: the landing/azureml/ path on the landing_pa datastore of the
# configured workspace, expressed as a long-form azureml:// URI.
input_data_uri = (
    "azureml://"
    f"subscriptions/{subscription_id}"
    f"/resourcegroups/{resource_group}"
    f"/workspaces/{workspace_name}"
    "/datastores/landing_pa/paths/landing/azureml/"
)

print("\nInput data URI: " + input_data_uri)


Input data URI: azureml://subscriptions/b8d6d487-0bd2-4773-b318-12ab763ed178/resourcegroups/strategicai-rg-uks-dev-01/workspaces/strategicai-mlw-uks-dev-01/datastores/landing_pa/paths/landing/azureml/


In [27]:
# Create pipeline instance

# Single switch for the event type: it selects the config file used by every
# step and also drives the display name and the event_type tag, so the job can
# no longer be labelled ECOMM while actually running the "vet" configuration.
pipeline_config_type = "ecomm"  # or "vet"

pipeline_job = personal_agendas_pipeline_with_embeddings(
    pipeline_input_data=Input(
        type=AssetTypes.URI_FOLDER,
        path=input_data_uri
    ),
    pipeline_config_type=pipeline_config_type
)

# Configure pipeline metadata
pipeline_job.display_name = (
    f"Personal Agendas Pipeline with Embeddings - {pipeline_config_type.upper()}"
)
pipeline_job.tags = {
    "project": "personal_agendas",
    "event_type": pipeline_config_type,
    "environment": "dev",
    "includes_neo4j": "true",
    "includes_embeddings": "true",
    "incremental": "false",
    "step_count": "3"
}
pipeline_job.experiment_name = "personal_agendas_embedding_experiment"

In [28]:
# Submit the pipeline
print("\n" + "="*60)
print("SUBMITTING PIPELINE WITH SESSION EMBEDDINGS")
print("="*60)

# Forget any job handle left in the kernel by an earlier run of this cell.
# Otherwise a failed submission would leave the previous `submitted_job` in
# place, and later cells that look for that name would report on the old job.
globals().pop("submitted_job", None)

try:
    submitted_job = ml_client.jobs.create_or_update(pipeline_job)
    print(f"\nSUCCESS! Pipeline submitted!")
    print(f"Job Name: {submitted_job.name}")
    print(f"Status: {submitted_job.status}")
    print(f"\nMonitor at: {submitted_job.studio_url}")
except Exception as e:
    # Best effort: report the failure and let the notebook continue;
    # `submitted_job` stays undefined in this case.
    print(f"\nSubmission failed: {str(e)}")

print("\n" + "="*60)

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.



SUBMITTING PIPELINE WITH SESSION EMBEDDINGS


[32mUploading azure_ml_pipelines_pa (1.34 MBs): 100%|██████████| 1344982/1344982 [00:00<00:00, 1606852.83it/s]
[39m

pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v


SUCCESS! Pipeline submitted!
Job Name: dreamy_neck_0r2hk6kh02
Status: NotStarted

Monitor at: https://ml.azure.com/runs/dreamy_neck_0r2hk6kh02?wsid=/subscriptions/b8d6d487-0bd2-4773-b318-12ab763ed178/resourcegroups/strategicai-rg-uks-dev-01/workspaces/strategicai-mlw-uks-dev-01&tid=3540e7dc-31b3-4057-9e31-43e9fe938179



## Monitor Pipeline Status

In [29]:
# Optional follow-up: re-fetch the submitted job and show its current state.
# Skipped entirely when no `submitted_job` exists in the notebook namespace.
if 'submitted_job' in locals():
    print("Checking pipeline status...")
    time.sleep(5)  # Wait a moment for job to initialize

    job = ml_client.jobs.get(submitted_job.name)
    print(
        f"\nPipeline: {job.name}",
        f"Status: {job.status}",
        "\nTo view detailed logs and outputs:",
        f"  {job.studio_url}",
        sep="\n",
    )

Checking pipeline status...

Pipeline: dreamy_neck_0r2hk6kh02
Status: Running

To view detailed logs and outputs:
  https://ml.azure.com/runs/dreamy_neck_0r2hk6kh02?wsid=/subscriptions/b8d6d487-0bd2-4773-b318-12ab763ed178/resourcegroups/strategicai-rg-uks-dev-01/workspaces/strategicai-mlw-uks-dev-01&tid=3540e7dc-31b3-4057-9e31-43e9fe938179


## Alternative: Submit Incremental Pipeline

Use this section to submit an incremental version of the pipeline: Steps 1 and 2 run in full, and only Step 3 is incremental — it creates embeddings just for sessions that do not have one yet.

In [14]:
# Define INCREMENTAL Pipeline (full Steps 1 and 2, incremental Step 3)
@pipeline(
    compute="cpu-cluster",
    description="Personal Agendas INCREMENTAL pipeline with embeddings",
)
def personal_agendas_pipeline_incremental(
    pipeline_input_data: Input,
    pipeline_config_type: str = "ecomm"
):
    """
    Incremental Personal Agendas Pipeline
    Steps 1 and 2 are the regular components; Step 3 uses the incremental
    embedding component, which only targets sessions without embeddings.

    Args:
        pipeline_input_data: Folder input handed to Step 1 as ``input_uri``.
        pipeline_config_type: Forwarded to every step as ``config_type``.

    Returns:
        Dict exposing the four Step 1 outputs plus the metadata outputs of
        Steps 2 and 3 as pipeline-level outputs.
    """

    # Note: You would need incremental versions of step 1 and 2 as well
    # For now, using regular step 1 and 2, with incremental embedding step

    # Step 1: Data Preparation
    step1 = data_preparation_component(
        input_uri=pipeline_input_data,
        config_type=pipeline_config_type
    )

    # Step 2: Neo4J Preparation
    step2 = neo4j_preparation_component(
        config_type=pipeline_config_type,
        input_registration=step1.outputs.registration_output,
        input_scan=step1.outputs.scan_output,
        input_session=step1.outputs.session_output
    )

    # Step 3: Session Embedding - INCREMENTAL version.
    # Ordering after Step 2 is expressed by binding Step 2's metadata output to
    # the optional `neo4j_ready` input, as the non-incremental pipeline does.
    # This replaces the earlier `step3.after(step2)` call.
    step3 = session_embedding_component_incremental(
        config_type=pipeline_config_type,
        neo4j_ready=step2.outputs.metadata_output
    )

    return {
        "registration_data": step1.outputs.registration_output,
        "scan_data": step1.outputs.scan_output,
        "session_data": step1.outputs.session_output,
        "step1_metadata": step1.outputs.metadata_output,
        "neo4j_metadata": step2.outputs.metadata_output,
        "embedding_metadata": step3.outputs.metadata_output
    }