# Personal Agendas Complete Pipeline with 4 Steps
This notebook submits the complete Personal Agendas pipeline including:
- Step 1: Data Preparation (Registration, Scan, Session)
- Step 2: Neo4j Preparation (Visitors, Sessions, Relationships)
- Step 3: Session Embedding
- Step 4: Recommendations


In [1]:
# Import required libraries
import json
import os
import sys
import time
from datetime import datetime
from pathlib import Path

# Azure ML imports
from azure.ai.ml import MLClient, command, Input, Output
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Environment, AmlCompute
from azure.ai.ml.dsl import pipeline
from azure.identity import DefaultAzureCredential
from azure.identity import ClientSecretCredential

from dotenv import load_dotenv

# Load environment variables
# Reads a local .env file (if present) into the process environment so the
# os.getenv() lookups below can see the values.
load_dotenv()

print("="*60)
print("AZURE ML PIPELINE COMPLETE")
print("="*60)

# Configuration
# Service principal used to authenticate against Azure.
client_id = os.getenv("AZURE_CLIENT_ID")
client_secret = os.getenv("AZURE_CLIENT_SECRET")
tenant_id = os.getenv("AZURE_TENANT_ID")
# Coordinates of the Azure ML workspace the pipeline is submitted to.
subscription_id = os.getenv("SUBSCRIPTION_ID")
resource_group = os.getenv("RESOURCE_GROUP")
workspace_name = os.getenv("AZUREML_WORKSPACE_NAME")

# sys.version is the Python interpreter version, not the Azure ML SDK version,
# so label it accordingly.
print(f"Python Version: {sys.version}")
print(f"Current time: {datetime.now().isoformat()}")

AZURE ML PIPELINE COMPLETE
Azure ML SDK Version: 3.10.18 (main, Jun  5 2025, 13:14:17) [GCC 11.2.0]
Current time: 2025-09-04T15:43:09.528688


In [2]:
# IMPORTANT: Configure Neo4j credentials
# Option 1: Load from .env file
load_dotenv()

# Option 2: Set directly here (replace with your actual credentials)
# Uncomment and update these lines (never commit real credentials to the notebook):
# os.environ["NEO4J_URI"] = "neo4j+s://your-instance.databases.neo4j.io"
# os.environ["NEO4J_USERNAME"] = "neo4j"
# os.environ["NEO4J_PASSWORD"] = "your-password"

neo4j_uri = os.getenv("NEO4J_URI")
neo4j_username = os.getenv("NEO4J_USERNAME")
neo4j_password = os.getenv("NEO4J_PASSWORD")

# Name the unset variables explicitly: the Step 2 and Step 3 component cells
# substitute placeholder values for anything missing, so a vague warning here
# is easy to overlook until the job fails remotely.
missing_neo4j_settings = [
    env_name
    for env_name, env_value in (
        ("NEO4J_URI", neo4j_uri),
        ("NEO4J_USERNAME", neo4j_username),
        ("NEO4J_PASSWORD", neo4j_password),
    )
    if not env_value
]

if missing_neo4j_settings:
    print("WARNING: Neo4j credentials not found")
    print("Please set NEO4J_URI, NEO4J_USERNAME, and NEO4J_PASSWORD")
    print(f"Missing: {', '.join(missing_neo4j_settings)}")
else:
    print("Neo4j credentials configured")

Neo4j credentials configured


In [3]:
# Databricks / MLflow settings; the Step 4 component cell forwards them to the
# recommendations job as environment variables.
databricks_token = os.getenv("DATABRICKS_TOKEN")
databricks_host = os.getenv("DATABRICKS_HOST")
mlflow_tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow_registry_uri = os.getenv("MLFLOW_REGISTRY_URI")
mlflow_experiment_id = os.getenv("MLFLOW_EXPERIMENT_ID")

# Report exactly which variables are unset instead of only a generic warning.
missing_mlflow_settings = [
    env_name
    for env_name, env_value in (
        ("DATABRICKS_TOKEN", databricks_token),
        ("DATABRICKS_HOST", databricks_host),
        ("MLFLOW_TRACKING_URI", mlflow_tracking_uri),
        ("MLFLOW_REGISTRY_URI", mlflow_registry_uri),
        ("MLFLOW_EXPERIMENT_ID", mlflow_experiment_id),
    )
    if not env_value
]

if missing_mlflow_settings:
    print("WARNING: Databricks / MLFLOW  credentials not found")
    print("Please set DATABRICKS_TOKEN, DATABRICKS_HOST, MLFLOW_TRACKING_URI, MLFLOW_REGISTRY_URI and MLFLOW_EXPERIMENT_ID")
    print(f"Missing: {', '.join(missing_mlflow_settings)}")
else:
    print("Databricks / MLFLOW credentials configured")

Databricks / MLFLOW credentials configured


In [4]:
# Create ML Client
# Use the service principal from the environment when it is fully configured;
# otherwise fall back to DefaultAzureCredential rather than passing None values
# into ClientSecretCredential.
if all([tenant_id, client_id, client_secret]):
    credential = ClientSecretCredential(
        tenant_id=tenant_id,
        client_id=client_id,
        client_secret=client_secret
    )
else:
    print("Service principal settings incomplete; using DefaultAzureCredential")
    credential = DefaultAzureCredential()

# Fail early with a readable message when the workspace coordinates are unset.
missing_workspace_settings = [
    env_name
    for env_name, env_value in (
        ("SUBSCRIPTION_ID", subscription_id),
        ("RESOURCE_GROUP", resource_group),
        ("AZUREML_WORKSPACE_NAME", workspace_name),
    )
    if not env_value
]
if missing_workspace_settings:
    raise ValueError(
        f"Missing workspace settings: {', '.join(missing_workspace_settings)}"
    )

ml_client = MLClient(
    credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name
)

# Constructing MLClient does not contact the service, so fetch the workspace
# once to confirm the credentials and coordinates actually work before
# reporting a connection.
workspace = ml_client.workspaces.get(workspace_name)
print(f"Connected to workspace: {workspace.name}")

Connected to workspace: strategicai-mlw-uks-dev-01


In [5]:
# Verify project structure
# The notebook may be started either from <project>/notebooks or from the
# project root itself; resolve the root accordingly.
current_dir = Path.cwd()
if current_dir.name == "notebooks":
    project_root = current_dir.parent
    print("Running from notebooks folder")
else:
    project_root = current_dir
    print("Running from project root")

print(f"Project root: {project_root}")

# Check required directories
pa_dir = project_root / "PA"
if not pa_dir.exists():
    print(f"ERROR: PA directory not found at {pa_dir}")
else:
    print("✓ PA directory found")

pipeline_dir = project_root / "azureml_pipeline"
if not pipeline_dir.exists():
    print("Creating azureml_pipeline directory")
    pipeline_dir.mkdir(exist_ok=True)
else:
    print("✓ azureml_pipeline directory found")

# Check for all step scripts
step_scripts = {
    "Step 1": pipeline_dir / "azureml_step1_data_prep.py",
    "Step 2": pipeline_dir / "azureml_step2_neo4j_prep.py",
    "Step 3": pipeline_dir / "azureml_step3_session_embedding.py",
    "Step 4": pipeline_dir / "azureml_step4_recommendations.py"
}

for step_name, script_path in step_scripts.items():
    if script_path.exists():
        print(f"✓ {step_name} script found")
    else:
        print(f"WARNING: {step_name} script not found at {script_path}")

# Check config files
config_vet = pa_dir / "config" / "config_vet.yaml"
config_ecomm = pa_dir / "config" / "config_ecomm.yaml"

# Report a missing config the same way a missing step script is reported,
# instead of staying silent about it.
for config_path in (config_ecomm, config_vet):
    if config_path.exists():
        print(f"✓ {config_path.name} found")
    else:
        print(f"WARNING: {config_path.name} not found at {config_path}")

Running from notebooks folder
Project root: /mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-juan/code/Notebooks/repos/azure_ml_pipelines_pa
✓ PA directory found
✓ azureml_pipeline directory found
✓ Step 1 script found
✓ Step 2 script found
✓ Step 3 script found
✓ Step 4 script found
✓ config_ecomm.yaml found
✓ config_vet.yaml found


In [19]:
# Environment setup
dependencies_dir = "./env"
custom_env_name = "pa-env"
custom_env_version = "7"  # registered version to reuse when it already exists
try:
    job_env = ml_client.environments.get(name=custom_env_name, version=custom_env_version)
    print(f"Using existing environment: {custom_env_name}:{job_env.version}")
except Exception as lookup_error:
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate, and the reason for the failed lookup is shown
    # instead of being swallowed.
    print(
        f"Environment {custom_env_name}:{custom_env_version} not available "
        f"({type(lookup_error).__name__}); creating it"
    )

    # Create environment if it doesn't exist
    conda_file = Path(dependencies_dir) / "conda.yaml"
    if not conda_file.exists():
        print(f"WARNING: conda.yaml not found at {conda_file}")
        print("Using default conda configuration")

    # No version is passed here, so the version that ends up registered is
    # whatever the service reports back in job_env.version.
    job_env = Environment(
        name=custom_env_name,
        description="Environment for Personal Agendas pipeline with Neo4j and embeddings",
        conda_file=str(conda_file) if conda_file.exists() else None,
        image="mcr.microsoft.com/azureml/openmpi5.0-ubuntu24.04:20250601.v1"
    )
    job_env = ml_client.environments.create_or_update(job_env)
    print(f"Created environment: {custom_env_name}:{job_env.version}")

ActivityCompleted: Activity=Environment.Get, HowEnded=Failure, Duration=208.8 [ms], Exception=ResourceNotFoundError, ErrorCategory=UserError, ErrorMessage=(UserError) No environment exists for name: pa-env, version: 7, label: 
Code: UserError
Message: No environment exists for name: pa-env, version: 7, label: 
ActivityCompleted: Activity=Environment.CreateOrUpdate, HowEnded=Failure, Duration=184.11 [ms], Exception=ResourceExistsError, ErrorCategory=UserError, ErrorMessage=(UserError) Environment pa-env with version 6 is already registered and cannot be changed.
Code: UserError
Message: Environment pa-env with version 6 is already registered and cannot be changed.


Created environment: pa-env:7


In [20]:
# Step 1: Data Preparation Component
# Runs azureml_step1_data_prep.py against the input folder and writes one
# output folder each for registration, scan, session and run metadata.
data_preparation_component = command(
    name="data_preparation",
    display_name="Step 1: Data Preparation",
    description="Process registration, scan, and session data",
    inputs={
        "input_uri": Input(type="uri_folder"),
        # Selects PA/config/config_<config_type>.yaml in the command below.
        "config_type": Input(type="string", default="ecomm")
    },
    outputs={
        "registration_output": Output(type="uri_folder"),
        "scan_output": Output(type="uri_folder"),
        "session_output": Output(type="uri_folder"),
        "metadata_output": Output(type="uri_folder")
    },
    # The whole project root is uploaded as the job's code snapshot.
    # NOTE(review): confirm local secret files such as .env are excluded from
    # the snapshot (e.g. via an ignore file) before submitting.
    code=str(project_root),
    command="""python azureml_pipeline/azureml_step1_data_prep.py \\
        --config PA/config/config_${{inputs.config_type}}.yaml \\
        --input_uri ${{inputs.input_uri}} \\
        --output_registration ${{outputs.registration_output}} \\
        --output_scan ${{outputs.scan_output}} \\
        --output_session ${{outputs.session_output}} \\
        --output_metadata ${{outputs.metadata_output}}
    """,
    environment=f"{job_env.name}:{job_env.version}",
    environment_variables={
        "KEYVAULT_NAME": "strategicai-kv-uks-dev"
    },
    compute="cpu-cluster",
    is_deterministic=False
)

# Same confirmation the Step 2-4 component cells print.
print("✓ Step 1 component defined")

In [21]:
# Step 2: Neo4j Preparation Component
# Takes the three data folders produced by Step 1 and runs
# azureml_step2_neo4j_prep.py with the Neo4j connection settings exported as
# environment variables.

neo4j_preparation_component = command(
    name="neo4j_preparation",
    display_name="Step 2: Neo4J Preparation",
    description="Upload data to Neo4j database",
    code=str(project_root),
    compute="cpu-cluster",
    is_deterministic=False,
    environment=f"{job_env.name}:{job_env.version}",
    inputs=dict(
        config_type=Input(type="string", default="ecomm"),
        input_registration=Input(type="uri_folder"),
        input_scan=Input(type="uri_folder"),
        input_session=Input(type="uri_folder"),
    ),
    outputs=dict(
        metadata_output=Output(type="uri_folder"),
    ),
    # Each fragment ends in a literal backslash + newline so the shell sees one
    # continued command line.
    command=(
        "python azureml_pipeline/azureml_step2_neo4j_prep.py \\\n"
        "        --config PA/config/config_${{inputs.config_type}}.yaml \\\n"
        "        --input_registration ${{inputs.input_registration}} \\\n"
        "        --input_scan ${{inputs.input_scan}} \\\n"
        "        --input_session ${{inputs.input_session}} \\\n"
        "        --output_metadata ${{outputs.metadata_output}}\n"
        "    "
    ),
    # Unset credentials are replaced by placeholder values rather than None.
    environment_variables=dict(
        KEYVAULT_NAME="strategicai-kv-uks-dev",
        NEO4J_URI=neo4j_uri or "neo4j+s://your-uri",
        NEO4J_USERNAME=neo4j_username or "neo4j",
        NEO4J_PASSWORD=neo4j_password or "password",
    ),
)

print("✓ Step 2 component defined")

✓ Step 2 component defined


In [22]:
# Step 3: Session Embedding Component
# Runs azureml_step3_session_embedding.py with the Neo4j connection settings
# exported as environment variables.
session_embedding_component = command(
    name="session_embedding",
    display_name="Step 3: Session Embedding",
    description="Generate and store text embeddings for session nodes in Neo4j",
    inputs={
        "config_type": Input(type="string", default="ecomm"),
        # Not referenced by the command below; the pipeline binds it to the
        # Step 2 output so this step is ordered after Step 2.
        "neo4j_ready": Input(type="uri_folder", optional=True),
    },
    outputs={
        "metadata_output": Output(type="uri_folder")
    },
    code=str(project_root),
    # `\\` keeps a literal backslash in the string (a shell line continuation),
    # matching the Step 1 and Step 2 commands. A single `\` before the newline
    # is consumed by Python and silently joins the lines instead.
    command="""python azureml_pipeline/azureml_step3_session_embedding.py \\
        --config PA/config/config_${{inputs.config_type}}.yaml \\
        --output_metadata ${{outputs.metadata_output}}
    """,
    environment=f"{job_env.name}:{job_env.version}",
    environment_variables={
        "KEYVAULT_NAME": "strategicai-kv-uks-dev",
        # Unset credentials are replaced by placeholder values rather than None.
        "NEO4J_URI": neo4j_uri or "neo4j+s://your-uri",
        "NEO4J_USERNAME": neo4j_username or "neo4j",
        "NEO4J_PASSWORD": neo4j_password or "password"
    },
    compute="cpu-cluster",
    is_deterministic=False
)

print("✓ Step 3 component defined")

✓ Step 3 component defined


In [23]:
# Step 4: Recommendations Component
# Runs azureml_step4_recommendations.py with Neo4j, OpenAI and
# Databricks/MLflow settings exported as environment variables.
recommendations_component = command(
    name="recommendations",
    display_name="Step 4: Recommendations",
    description="Generate session recommendations for visitors",
    inputs={
        "config_type": Input(type="string", default="ecomm"),
        # Not referenced by the command below; the pipeline binds it to the
        # Step 3 output so this step is ordered after Step 3.
        "embeddings_ready": Input(type="uri_folder", optional=True),
    },
    outputs={
        "metadata_output": Output(type="uri_folder")
    },
    code=str(project_root),
    # `\\` keeps a literal backslash in the string (a shell line continuation),
    # matching the Step 1 and Step 2 commands.
    command="""python azureml_pipeline/azureml_step4_recommendations.py \\
        --config PA/config/config_${{inputs.config_type}}.yaml \\
        --output_metadata ${{outputs.metadata_output}}
    """,
    environment=f"{job_env.name}:{job_env.version}",
    # Every value falls back to "" so an unset setting never puts None into
    # this string-to-string mapping (previously NEO4J_URI and NEO4J_USERNAME
    # were passed through unguarded).
    # NOTE(review): these secrets travel as plain job environment variables;
    # consider resolving them from the Key Vault named in KEYVAULT_NAME if the
    # step script supports it.
    environment_variables={
        "KEYVAULT_NAME": "strategicai-kv-uks-dev",
        "NEO4J_URI": neo4j_uri or "",
        "NEO4J_USERNAME": neo4j_username or "",
        "NEO4J_PASSWORD": neo4j_password or "",
        "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", ""),
        # MLflow/Databricks environment variables
        "DATABRICKS_TOKEN": databricks_token or "",
        "DATABRICKS_HOST": databricks_host or "",
        "MLFLOW_TRACKING_URI": mlflow_tracking_uri or "",
        "MLFLOW_REGISTRY_URI": mlflow_registry_uri or "",
        "MLFLOW_EXPERIMENT_ID": mlflow_experiment_id or ""
    },
    compute="cpu-cluster",
    is_deterministic=False
)

print("✓ Step 4 component defined")

✓ Step 4 component defined


In [24]:
# Define the complete pipeline
# Wires the four components into one job: Step 1 feeds its three data folders
# into Step 2, and each later step takes the previous step's metadata output
# purely so that it runs afterwards.
@pipeline(
    compute="cpu-cluster",
    description="Complete Personal Agendas pipeline with 4 steps",
)
def personal_agendas_complete_pipeline(
    pipeline_input_data: Input,
    pipeline_config_type: str = "ecomm"
):
    """Complete Personal Agendas pipeline."""

    # Step 1: Data Preparation
    data_prep_node = data_preparation_component(
        input_uri=pipeline_input_data,
        config_type=pipeline_config_type,
    )
    data_prep_node.name = "step1_data_preparation"

    # Step 2: Neo4J Preparation - consumes the folders written by Step 1
    neo4j_node = neo4j_preparation_component(
        config_type=pipeline_config_type,
        input_registration=data_prep_node.outputs.registration_output,
        input_scan=data_prep_node.outputs.scan_output,
        input_session=data_prep_node.outputs.session_output,
    )
    neo4j_node.name = "step2_neo4j_preparation"

    # Step 3: Session Embedding - works on Neo4j directly; the metadata input
    # only establishes the dependency on Step 2
    embedding_node = session_embedding_component(
        config_type=pipeline_config_type,
        neo4j_ready=neo4j_node.outputs.metadata_output,
    )
    embedding_node.name = "step3_session_embedding"

    # Step 4: Recommendations - ordered after Step 3 the same way
    recommendations_node = recommendations_component(
        config_type=pipeline_config_type,
        embeddings_ready=embedding_node.outputs.metadata_output,
    )
    recommendations_node.name = "step4_recommendations"

    # Expose every step output as a pipeline-level output
    return {
        "registration_data": data_prep_node.outputs.registration_output,
        "scan_data": data_prep_node.outputs.scan_output,
        "session_data": data_prep_node.outputs.session_output,
        "step1_metadata": data_prep_node.outputs.metadata_output,
        "neo4j_metadata": neo4j_node.outputs.metadata_output,
        "embedding_metadata": embedding_node.outputs.metadata_output,
        "recommendations_metadata": recommendations_node.outputs.metadata_output,
    }

print("✓ Complete pipeline defined")

✓ Complete pipeline defined


In [25]:
# Configure input data URI
# Long-form azureml:// datastore URI for the landing/azureml/ folder on the
# landing_pa datastore, assembled piece by piece for readability.
input_data_uri = (
    "azureml://"
    f"subscriptions/{subscription_id}/"
    f"resourcegroups/{resource_group}/"
    f"workspaces/{workspace_name}/"
    "datastores/landing_pa/"
    "paths/landing/azureml/"
)

print(f"Input data URI: {input_data_uri}")

Input data URI: azureml://subscriptions/b8d6d487-0bd2-4773-b318-12ab763ed178/resourcegroups/strategicai-rg-uks-dev-01/workspaces/strategicai-mlw-uks-dev-01/datastores/landing_pa/paths/landing/azureml/


In [26]:
# Create pipeline instance
# Single switch for the event configuration: it drives the pipeline input, the
# display name and the event_type tag so the three cannot drift apart.
selected_config_type = "ecomm"  # or "vet"

pipeline_job = personal_agendas_complete_pipeline(
    pipeline_input_data=Input(
        type=AssetTypes.URI_FOLDER,
        path=input_data_uri
    ),
    pipeline_config_type=selected_config_type
)

# Configure pipeline metadata
pipeline_job.display_name = (
    f"Personal Agendas Complete Pipeline - {selected_config_type.upper()}"
)
pipeline_job.tags = {
    "project": "personal_agendas",
    "event_type": selected_config_type,
    "environment": "dev",
    "includes_neo4j": "true",
    "includes_embeddings": "true",
    "includes_recommendations": "true",
    "incremental": "false",
    "step_count": "4",
    "complete_pipeline": "true"
}
pipeline_job.experiment_name = "personal_agendas_complete_experiment"

print("✓ Pipeline instance created")

✓ Pipeline instance created


In [27]:
# Submit the pipeline
print("\nSubmitting pipeline...")
print("=" * 60)

try:
    # Hand the job to the workspace; the returned object describes the run.
    pipeline_run = ml_client.jobs.create_or_update(pipeline_job)

    print("✓ Pipeline submitted successfully!")
    print("\nPipeline Details:")
    # Attributes are read one at a time, in display order, as each line prints.
    for detail_label, detail_attr in (
        ("Name", "name"),
        ("Display Name", "display_name"),
        ("Status", "status"),
        ("Experiment", "experiment_name"),
    ):
        print(f"  {detail_label}: {getattr(pipeline_run, detail_attr)}")
    print("\n🔗 View pipeline in Azure ML Studio:")
    print(f"  {pipeline_run.studio_url}")

except Exception as submit_error:
    # Show a short message, then let the original exception surface.
    print(f"❌ Error submitting pipeline: {submit_error}")
    raise


Submitting pipeline...


[32mUploading azure_ml_pipelines_pa (1.45 MBs): 100%|██████████| 1451739/1451739 [00:00<00:00, 1771596.23it/s]
[39m

pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v

✓ Pipeline submitted successfully!

Pipeline Details:
  Name: keen_spade_f8j3htdrks
  Display Name: Personal Agendas Complete Pipeline - ECOMM
  Status: NotStarted
  Experiment: personal_agendas_complete_experiment

🔗 View pipeline in Azure ML Studio:
  https://ml.azure.com/runs/keen_spade_f8j3htdrks?wsid=/subscriptions/b8d6d487-0bd2-4773-b318-12ab763ed178/resourcegroups/strategicai-rg-uks-dev-01/workspaces/strategicai-mlw-uks-dev-01&tid=3540e7dc-31b3-4057-9e31-43e9fe938179


In [28]:
# Optional: Monitor pipeline progress
# Polls the submitted job until it reaches a terminal status, rewriting a
# single status line in place. `time` comes from the notebook's import cell.
TERMINAL_STATUSES = ("Completed", "Failed", "Canceled")
POLL_INTERVAL_SECONDS = 30  # Check every 30 seconds
STATUS_LINE_WIDTH = 80      # pad width used to blank out the previous line

print("\nMonitoring pipeline progress...")
print("(Press Ctrl+C to stop monitoring)\n")

try:
    while True:
        # Get the latest status
        job = ml_client.jobs.get(pipeline_run.name)
        status_line = f"[{datetime.now().strftime('%H:%M:%S')}] Status: {job.status}"

        if job.status in TERMINAL_STATUSES:
            # Pad the final line so remnants of a longer "(waiting...)" line
            # written earlier on the same row are overwritten.
            print(status_line.ljust(STATUS_LINE_WIDTH))
            print(f"\nPipeline {job.status}!")
            if job.status == "Failed":
                print("Check the logs in Azure ML Studio for error details.")
            break

        # "\r" returns to the start of the line so the next update replaces
        # this one; the padding keeps shorter updates from leaving residue.
        print(f"{status_line} (waiting...)".ljust(STATUS_LINE_WIDTH), end="\r")
        time.sleep(POLL_INTERVAL_SECONDS)

except KeyboardInterrupt:
    print("\n\nMonitoring stopped. Pipeline continues running in Azure ML.")
    print(f"Check status at: {pipeline_run.studio_url}")


Monitoring pipeline progress...
(Press Ctrl+C to stop monitoring)

[17:23:04] Status: Completedwaiting...)..)

Pipeline Completed!


## Pipeline Summary

This notebook submits a complete 4-step Personal Agendas pipeline:

1. **Data Preparation**: Processes registration, scan, and session data
2. **Neo4j Preparation**: Loads data into Neo4j and creates relationships
3. **Session Embedding**: Generates embeddings for sessions
4. **Recommendations**: Generates personalized session recommendations

### Key Features:
- Fully generic and configurable (works with both `ecomm` and `vet` configurations)
- Proper data flow between steps
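- Neo4j integration: credentials are read from environment variables (or a local `.env` file) and passed to the pipeline steps as job environment variables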
- Incremental processing support (can be enabled per step)
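- Error handling and logging: submission errors are reported and re-raised in the notebook; step-level failures are diagnosed from the logs in Azure ML Studio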

### Next Steps:
1. Monitor the pipeline execution in Azure ML Studio
2. Check the output metadata for each step
3. Verify recommendations are generated in Neo4j
4. Download the recommendations output file for analysis