# Azure ML Pipeline with Step 1 and Step 2

This notebook creates and submits a pipeline with:
1. Step 1: Data Preparation (Registration, Scan, Session)
2. Step 2: Neo4J Preparation (Visitor, Session, Streams, Relationships)

This notebook builds on the working Step 1 pipeline configuration.

In [1]:
from azure.ai.ml import MLClient, command, Input, Output
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.entities import Environment
from azure.ai.ml.constants import AssetTypes
from azure.identity import ClientSecretCredential
import os
from dotenv import load_dotenv
from pathlib import Path
import time

# Load environment variables (python-dotenv reads a local .env file, if any,
# into the process environment)
load_dotenv()

print("="*60)
print("🚀 AZURE ML PIPELINE WITH NEO4J STEP")
print("="*60)

# Configuration: service-principal credentials and workspace coordinates
client_id = os.getenv("AZURE_CLIENT_ID")
client_secret = os.getenv("AZURE_CLIENT_SECRET")
tenant_id = os.getenv("AZURE_TENANT_ID")
subscription_id = os.getenv("SUBSCRIPTION_ID")
resource_group = os.getenv("RESOURCE_GROUP")
workspace_name = os.getenv("AZUREML_WORKSPACE_NAME")

# Fail fast: os.getenv returns None for an unset variable, and a None value
# would otherwise only surface later, when the credential or client is built,
# without saying which variable is missing.
required_settings = {
    "AZURE_CLIENT_ID": client_id,
    "AZURE_CLIENT_SECRET": client_secret,
    "AZURE_TENANT_ID": tenant_id,
    "SUBSCRIPTION_ID": subscription_id,
    "RESOURCE_GROUP": resource_group,
    "AZUREML_WORKSPACE_NAME": workspace_name,
}
missing_settings = [name for name, value in required_settings.items() if not value]
if missing_settings:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(missing_settings)
    )

print("✅ Environment variables loaded")
print(f"  • Workspace: {workspace_name}")
print(f"  • Resource Group: {resource_group}")

🚀 AZURE ML PIPELINE WITH NEO4J STEP
✅ Environment variables loaded
  • Workspace: strategicai-mlw-uks-dev-01
  • Resource Group: strategicai-rg-uks-dev-01


In [2]:
# Service-principal credential assembled from the tenant/client values read
# from the environment in the configuration cell.
credential = ClientSecretCredential(
    tenant_id=tenant_id, client_id=client_id, client_secret=client_secret
)

# Workspace client used by the later cells (environment lookup, job submission).
# NOTE(review): building the client may not contact the service, so the message
# below does not by itself prove connectivity — confirm.
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)

print(f"Connected to workspace: {workspace_name}")

Connected to workspace: strategicai-mlw-uks-dev-01


In [3]:
# Resolve the project root. The notebook may be started either from the
# repository root or from its "notebooks" sub-folder; in the latter case the
# root is one level up.
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == "notebooks" else current_dir

print(f"Project root: {project_root}")

Project root: /mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-juan/code/Notebooks/repos/azure_ml_pipelines_pa


In [4]:
# Setup environment directory (the conda.yaml used to build the environment
# is expected inside it)
dependencies_dir = "./env"
os.makedirs(dependencies_dir, exist_ok=True)

# Create or get environment
custom_env_name = "pa-env"
try:
    # Prefer the already-registered environment, pinned to version 5
    job_env = ml_client.environments.get(name=custom_env_name, version="5")
    env_version = job_env.version
    print(f"Using existing environment: {custom_env_name}:{env_version}")
except Exception as lookup_error:
    # `except Exception` rather than a bare `except:` so that KeyboardInterrupt
    # and SystemExit still propagate; the reason for the fallback is reported
    # instead of being silently discarded.
    print(
        f"Could not fetch {custom_env_name}:5 "
        f"({type(lookup_error).__name__}); creating the environment instead"
    )
    job_env = Environment(
        name=custom_env_name,
        description="Environment for Personal Agendas pipeline with Neo4j support",
        conda_file=os.path.join(dependencies_dir, "conda.yaml"),
        image="mcr.microsoft.com/azureml/openmpi5.0-ubuntu24.04:20250601.v1",
    )
    job_env = ml_client.environments.create_or_update(job_env)
    env_version = job_env.version
    print(f"Created environment: {custom_env_name}:{env_version}")

print(f"Environment {job_env.name} version {job_env.version} ready")

Using existing environment: pa-env:5
Environment pa-env version 5 ready


In [5]:
# Verify local directory structure
# (Path comes from the import cell at the top of the notebook; it is not
# re-imported here.)

current_dir = Path.cwd()
if current_dir.name == "notebooks":
    project_root = current_dir.parent
    print("Running from notebooks folder")
else:
    project_root = current_dir
    print("Running from project root")

print(f"Project root: {project_root}")

# Check required directories
pa_dir = project_root / "PA"
if not pa_dir.exists():
    print(f"ERROR: PA directory not found at {pa_dir}")
else:
    print("✓ PA directory found")

pipeline_dir = project_root / "azureml_pipeline"
if not pipeline_dir.exists():
    print("Creating azureml_pipeline directory")
    pipeline_dir.mkdir(exist_ok=True)
else:
    print("✓ azureml_pipeline directory found")

# Check for step scripts
step1_script = pipeline_dir / "azureml_step1_data_prep.py"
step2_script = pipeline_dir / "azureml_step2_neo4j_prep.py"

if not step1_script.exists():
    print(f"WARNING: Step 1 script not found at {step1_script}")
else:
    print("✓ Step 1 script found")

if not step2_script.exists():
    print(f"WARNING: Step 2 script not found at {step2_script}")
    print("  Please ensure azureml_step2_neo4j_prep.py is in azureml_pipeline folder")
else:
    print("✓ Step 2 script found")

# Check config files. A missing file is now reported as well, consistent with
# the directory and script checks above (previously it was silently skipped).
config_vet = pa_dir / "config" / "config_vet.yaml"
config_ecomm = pa_dir / "config" / "config_ecomm.yaml"

for config_path in (config_ecomm, config_vet):
    if config_path.exists():
        print(f"✓ {config_path.name} found")
    else:
        print(f"WARNING: {config_path.name} not found at {config_path}")

Running from notebooks folder
Project root: /mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-juan/code/Notebooks/repos/azure_ml_pipelines_pa
✓ PA directory found
✓ azureml_pipeline directory found
✓ Step 1 script found
✓ Step 2 script found
✓ config_ecomm.yaml found
✓ config_vet.yaml found


In [6]:
# Step 1 component — data preparation, NON-INCREMENTAL.
# Runs azureml_step1_data_prep.py against the selected config and writes four
# folder outputs (registration, scan, session, metadata).
data_preparation_component = command(
    # Identity
    name="data_preparation",
    display_name="Step 1: Data Preparation",
    description="Process registration, scan, and session data",
    # Execution context
    compute="cpu-cluster",
    environment=f"{job_env.name}:{job_env.version}",
    # NOTE(review): presumably the step script reads secrets from this Key Vault — confirm
    environment_variables={"KEYVAULT_NAME": "strategicai-kv-uks-dev"},
    is_deterministic=False,
    # The whole project root is used as the code snapshot, so the command
    # below uses paths relative to it.
    code=str(project_root),
    # Interface
    inputs=dict(
        input_uri=Input(type="uri_folder"),
        config_type=Input(type="string", default="ecomm"),
    ),
    outputs={
        port: Output(type="uri_folder")
        for port in (
            "registration_output",
            "scan_output",
            "session_output",
            "metadata_output",
        )
    },
    # Standard processing: the --incremental flag is deliberately absent
    command="""python azureml_pipeline/azureml_step1_data_prep.py \
        --config PA/config/config_${{inputs.config_type}}.yaml \
        --input_uri ${{inputs.input_uri}} \
        --output_registration ${{outputs.registration_output}} \
        --output_scan ${{outputs.scan_output}} \
        --output_session ${{outputs.session_output}} \
        --output_metadata ${{outputs.metadata_output}}
    """,
)

In [7]:
# Step 2 component — Neo4J preparation, NON-INCREMENTAL.
# Runs azureml_step2_neo4j_prep.py on the registration/scan/session folders and
# writes a single metadata folder output.
neo4j_preparation_component = command(
    # Identity
    name="neo4j_preparation",
    display_name="Step 2: Neo4J Preparation",
    description="Upload data to Neo4j database - create all nodes and relationships",
    # Execution context
    compute="cpu-cluster",
    environment=f"{job_env.name}:{job_env.version}",
    # NOTE(review): presumably the step script reads Neo4j secrets from this Key Vault — confirm
    environment_variables={"KEYVAULT_NAME": "strategicai-kv-uks-dev"},
    is_deterministic=False,
    # The whole project root is used as the code snapshot, so the command
    # below uses paths relative to it.
    code=str(project_root),
    # Interface
    inputs=dict(
        config_type=Input(type="string", default="ecomm"),
        input_registration=Input(type="uri_folder"),
        input_scan=Input(type="uri_folder"),
        input_session=Input(type="uri_folder"),
    ),
    outputs=dict(metadata_output=Output(type="uri_folder")),
    # Standard processing: no --incremental flag (will recreate all nodes)
    command="""python azureml_pipeline/azureml_step2_neo4j_prep.py \
        --config PA/config/config_${{inputs.config_type}}.yaml \
        --input_registration ${{inputs.input_registration}} \
        --input_scan ${{inputs.input_scan}} \
        --input_session ${{inputs.input_session}} \
        --output_metadata ${{outputs.metadata_output}}
    """,
)

### Alternative: Incremental Components

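If you need incremental processing, substitute the component definitions below for the non-incremental ones (`data_preparation_component` and `neo4j_preparation_component`) inside the pipeline definition: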

In [8]:
# OPTIONAL — incremental variant of Step 1.
# Same interface as the standard data-preparation component; the only change
# in the invocation is the added --incremental flag.
data_preparation_component_incremental = command(
    # Identity
    name="data_preparation_incremental",
    display_name="Step 1: Data Preparation (Incremental)",
    description="Process registration, scan, and session data - incremental mode",
    # Execution context
    compute="cpu-cluster",
    environment=f"{job_env.name}:{job_env.version}",
    environment_variables={"KEYVAULT_NAME": "strategicai-kv-uks-dev"},
    is_deterministic=False,
    code=str(project_root),
    # Interface
    inputs=dict(
        input_uri=Input(type="uri_folder"),
        config_type=Input(type="string", default="ecomm"),
    ),
    outputs={
        port: Output(type="uri_folder")
        for port in (
            "registration_output",
            "scan_output",
            "session_output",
            "metadata_output",
        )
    },
    command="""python azureml_pipeline/azureml_step1_data_prep.py \
        --config PA/config/config_${{inputs.config_type}}.yaml \
        --input_uri ${{inputs.input_uri}} \
        --incremental \
        --output_registration ${{outputs.registration_output}} \
        --output_scan ${{outputs.scan_output}} \
        --output_session ${{outputs.session_output}} \
        --output_metadata ${{outputs.metadata_output}}
    """,
)

# OPTIONAL — incremental variant of Step 2.
# Same interface as the standard Neo4J-preparation component; the only change
# in the invocation is the added --incremental flag.
neo4j_preparation_component_incremental = command(
    # Identity
    name="neo4j_preparation_incremental",
    display_name="Step 2: Neo4J Preparation (Incremental)",
    description="Upload data to Neo4j - only add new nodes/relationships",
    # Execution context
    compute="cpu-cluster",
    environment=f"{job_env.name}:{job_env.version}",
    environment_variables={"KEYVAULT_NAME": "strategicai-kv-uks-dev"},
    is_deterministic=False,
    code=str(project_root),
    # Interface
    inputs=dict(
        config_type=Input(type="string", default="ecomm"),
        input_registration=Input(type="uri_folder"),
        input_scan=Input(type="uri_folder"),
        input_session=Input(type="uri_folder"),
    ),
    outputs=dict(metadata_output=Output(type="uri_folder")),
    command="""python azureml_pipeline/azureml_step2_neo4j_prep.py \
        --config PA/config/config_${{inputs.config_type}}.yaml \
        --incremental \
        --input_registration ${{inputs.input_registration}} \
        --input_scan ${{inputs.input_scan}} \
        --input_session ${{inputs.input_session}} \
        --output_metadata ${{outputs.metadata_output}}
    """,
)

In [9]:
# Define the Pipeline (NON-INCREMENTAL VERSION)
# Wires the non-incremental Step 1 and Step 2 components together: Step 2
# consumes three of Step 1's folder outputs, and all outputs of both steps are
# exposed as pipeline-level outputs.
# NOTE(review): the DSL is understood to derive node names from the local
# variable names (`step1`, `step2`) and may read the docstring for metadata —
# confirm before renaming the variables or rewording the docstring.
@pipeline(
    compute="cpu-cluster",
    description="Personal Agendas pipeline with Neo4j preparation",
)
def personal_agendas_pipeline(
    pipeline_input_data: Input,
    pipeline_config_type: str = "ecomm"
):
    """
    Personal Agendas Pipeline
    Step 1: Data Preparation (Registration, Scan, Session)
    Step 2: Neo4J Preparation (Upload to database)
    """
    
    # Step 1: Data Preparation
    # Receives the pipeline's input folder and the config selector.
    step1 = data_preparation_component(
        input_uri=pipeline_input_data,
        config_type=pipeline_config_type
    )
    
    # Step 2: Neo4J Preparation - uses outputs from Step 1
    # Step 1's metadata output is not passed on; only registration, scan and
    # session folders feed Step 2.
    step2 = neo4j_preparation_component(
        config_type=pipeline_config_type,
        input_registration=step1.outputs.registration_output,
        input_scan=step1.outputs.scan_output,
        input_session=step1.outputs.session_output
    )
    
    # Pipeline-level outputs: the four Step 1 outputs plus Step 2's metadata,
    # keyed by the names they are exposed under.
    return {
        "registration_data": step1.outputs.registration_output,
        "scan_data": step1.outputs.scan_output,
        "session_data": step1.outputs.session_output,
        "step1_metadata": step1.outputs.metadata_output,
        "neo4j_metadata": step2.outputs.metadata_output
    }

In [10]:
# Configure input data URI: an azureml:// path to the "landing/azureml/" folder
# of the "landing_pa" datastore, scoped to the configured subscription,
# resource group and workspace.
input_data_uri = (
    f"azureml://subscriptions/{subscription_id}"
    f"/resourcegroups/{resource_group}"
    f"/workspaces/{workspace_name}"
    "/datastores/landing_pa/paths/landing/azureml/"
)

print(f"\nInput data URI: {input_data_uri}")


Input data URI: azureml://subscriptions/b8d6d487-0bd2-4773-b318-12ab763ed178/resourcegroups/strategicai-rg-uks-dev-01/workspaces/strategicai-mlw-uks-dev-01/datastores/landing_pa/paths/landing/azureml/


In [11]:
# Instantiate the pipeline for the "ecomm" configuration, reading from the
# datastore folder configured in input_data_uri.
pipeline_job = personal_agendas_pipeline(
    pipeline_input_data=Input(type=AssetTypes.URI_FOLDER, path=input_data_uri),
    pipeline_config_type="ecomm",
)

# Job metadata: experiment grouping, display name and descriptive tags.
pipeline_job.experiment_name = "personal_agendas_neo4j_experiment"
pipeline_job.display_name = "Personal Agendas Pipeline with Neo4j - ECOMM"
pipeline_job.tags = dict(
    project="personal_agendas",
    event_type="ecomm",
    storage_fixed="true",
    environment="dev",
    includes_neo4j="true",
    incremental="false",
)

In [12]:
# Submit the pipeline
print("\n" + "="*60)
print("SUBMITTING PIPELINE WITH NEO4J")
print("="*60)

# Drop any job handle left over from an earlier run of this cell. Without this,
# a failed re-submission would leave the previous `submitted_job` in the
# namespace and the status cell would report on the wrong (older) job.
globals().pop("submitted_job", None)

try:
    submitted_job = ml_client.jobs.create_or_update(pipeline_job)

    print("\n✅ SUCCESS! Pipeline submitted!")
    print("\n📊 Job Details:")
    print(f"  • Name: {submitted_job.name}")
    print(f"  • Status: {submitted_job.status}")
    print(f"  • Type: {submitted_job.type}")
    print("\n🔗 Monitor your pipeline at:")
    print(f"  {submitted_job.studio_url}")

    print("\n📋 Pipeline Steps:")
    print("  Step 1: Data Preparation")
    print("    - Registration processing")
    print("    - Scan processing")
    print("    - Session processing")
    print("  Step 2: Neo4J Preparation")
    print("    - Visitor nodes (this year, last year BVA/LVA)")
    print("    - Session nodes and stream relationships")
    print("    - Job to stream mappings")
    print("    - Specialization to stream mappings")
    print("    - Cross-year visitor relationships")

    print("\n💡 Tips:")
    print("  • Click the link above to watch pipeline progress")
    print("  • Check 'Outputs + logs' for detailed execution logs")
    print("  • Neo4j credentials loaded from Key Vault")
    print("  • The first run may take longer due to environment setup")

except Exception as e:
    # Deliberate best-effort: the failure is reported with troubleshooting
    # hints instead of raising, so the notebook keeps running.
    error_text = str(e)
    error_text_lower = error_text.lower()
    print(f"\n❌ Submission failed: {error_text}")
    print("\n🔧 Troubleshooting:")

    # Hints are chosen by substring match on the error message, first match wins.
    if "AuthorizationFailure" in error_text:
        print("  Storage authorization issue. Try:")
        print(f"  1. Run: az ml workspace sync-keys --resource-group {resource_group} --workspace-name {workspace_name}")
        print("  2. Wait 2 minutes and retry")
    elif "compute" in error_text_lower:
        print("  Compute cluster issue. Check:")
        print("  1. Cluster 'cpu-cluster' exists")
        print("  2. Cluster is running (not stopped)")
        print("  3. You have permissions to use it")
    elif "environment" in error_text_lower:
        print("  Environment issue. Ensure conda.yaml includes:")
        print("  - neo4j>=5.0.0")
        print("  - All other required packages")
    else:
        print("  Check the error message above for details")
        print("  Verify all prerequisites are met")

print("\n" + "="*60)

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.



SUBMITTING PIPELINE WITH NEO4J


[32mUploading azure_ml_pipelines_pa (1.12 MBs): 100%|██████████| 1116165/1116165 [00:00<00:00, 1801987.74it/s]
[39m

pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored



✅ SUCCESS! Pipeline submitted!

📊 Job Details:
  • Name: heroic_answer_nnt8bpnm2m
  • Status: NotStarted
  • Type: pipeline

🔗 Monitor your pipeline at:
  https://ml.azure.com/runs/heroic_answer_nnt8bpnm2m?wsid=/subscriptions/b8d6d487-0bd2-4773-b318-12ab763ed178/resourcegroups/strategicai-rg-uks-dev-01/workspaces/strategicai-mlw-uks-dev-01&tid=3540e7dc-31b3-4057-9e31-43e9fe938179

📋 Pipeline Steps:
  Step 1: Data Preparation
    - Registration processing
    - Scan processing
    - Session processing
  Step 2: Neo4J Preparation
    - Visitor nodes (this year, last year BVA/LVA)
    - Session nodes and stream relationships
    - Job to stream mappings
    - Specialization to stream mappings
    - Cross-year visitor relationships

💡 Tips:
  • Click the link above to watch pipeline progress
  • Check 'Outputs + logs' for detailed execution logs
  • Neo4j credentials loaded from Key Vault
  • The first run may take longer due to environment setup



## Monitor Pipeline Status

In [13]:
# Optional: Check pipeline status after submission.
# Runs only when a `submitted_job` handle exists in the notebook namespace.
if "submitted_job" in locals():
    print("Checking pipeline status...")
    time.sleep(5)  # Wait a moment for job to initialize

    # Re-fetch the job by name so the status printed is current, not the one
    # captured at submission time.
    job = ml_client.jobs.get(submitted_job.name)
    print(f"\nPipeline: {job.name}\nStatus: {job.status}")
    print(f"\nTo view detailed logs and outputs:\n  {job.studio_url}")

Checking pipeline status...

Pipeline: heroic_answer_nnt8bpnm2m
Status: Running

To view detailed logs and outputs:
  https://ml.azure.com/runs/heroic_answer_nnt8bpnm2m?wsid=/subscriptions/b8d6d487-0bd2-4773-b318-12ab763ed178/resourcegroups/strategicai-rg-uks-dev-01/workspaces/strategicai-mlw-uks-dev-01&tid=3540e7dc-31b3-4057-9e31-43e9fe938179
