# Azure ML Pipeline with Step 1 and Step 2

This notebook creates and submits a pipeline with:
1. Step 1: Data Preparation (Registration, Scan, Session)
2. Step 2: Neo4J Preparation (Visitor, Session, Streams, Relationships)

In [1]:
from azure.ai.ml import MLClient, command, Input, Output
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.entities import Environment
from azure.ai.ml.constants import AssetTypes
from azure.identity import ClientSecretCredential
import os
from dotenv import load_dotenv
from pathlib import Path
import time

# Load environment variables
# Reads a local .env file (when one exists) into the process environment so
# the os.getenv() lookups below can see its values.
load_dotenv()

print("="*60)
print("AZURE ML PIPELINE WITH NEO4J STEP")
print("="*60)

# Configuration
# Service-principal credentials and the coordinates of the target workspace.
client_id = os.getenv("AZURE_CLIENT_ID")
client_secret = os.getenv("AZURE_CLIENT_SECRET")
tenant_id = os.getenv("AZURE_TENANT_ID")
subscription_id = os.getenv("SUBSCRIPTION_ID")
resource_group = os.getenv("RESOURCE_GROUP")
workspace_name = os.getenv("AZUREML_WORKSPACE_NAME")

# Fail fast: every value above is consumed by the credential / client cells
# further down, so report unset (or empty) variables by name here instead of
# printing a success message and failing later with a less specific error.
_missing_azure_settings = [
    variable
    for variable, value in (
        ("AZURE_CLIENT_ID", client_id),
        ("AZURE_CLIENT_SECRET", client_secret),
        ("AZURE_TENANT_ID", tenant_id),
        ("SUBSCRIPTION_ID", subscription_id),
        ("RESOURCE_GROUP", resource_group),
        ("AZUREML_WORKSPACE_NAME", workspace_name),
    )
    if not value
]
if _missing_azure_settings:
    raise EnvironmentError(
        "Missing required environment variables: "
        + ", ".join(_missing_azure_settings)
    )

print("Environment variables loaded")

AZURE ML PIPELINE WITH NEO4J STEP
Environment variables loaded


In [2]:
# IMPORTANT: Configure Neo4j credentials
# Preferred route: keep NEO4J_URI, NEO4J_USERNAME and NEO4J_PASSWORD in the
# .env file, which is (re)loaded here.
load_dotenv()

# Alternative route: set the values for this kernel session only by
# uncommenting the lines below and filling in real values.
# os.environ["NEO4J_URI"] = "neo4j+s://your-instance.databases.neo4j.io"
# os.environ["NEO4J_USERNAME"] = "neo4j"
# os.environ["NEO4J_PASSWORD"] = "your-password"

neo4j_uri, neo4j_username, neo4j_password = (
    os.getenv(variable)
    for variable in ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
)

# Only report success when all three settings are present and non-empty.
if neo4j_uri and neo4j_username and neo4j_password:
    print("Neo4j credentials configured")
else:
    print("WARNING: Neo4j credentials not found")
    print("Please set NEO4J_URI, NEO4J_USERNAME, and NEO4J_PASSWORD")

Neo4j credentials configured


In [3]:
# Create ML Client
# Service-principal credential built from the client/tenant settings read in
# the configuration cell.
credential = ClientSecretCredential(
    client_id=client_id,
    client_secret=client_secret,
    tenant_id=tenant_id,
)

# Client handle scoped to a single workspace; later cells use it for the
# environment lookup and the job submission.
ml_client = MLClient(
    credential,
    workspace_name=workspace_name,
    resource_group_name=resource_group,
    subscription_id=subscription_id,
)

print(f"Connected to workspace: {workspace_name}")

Connected to workspace: strategicai-mlw-uks-dev-01


In [4]:
# Determine project root
# When the kernel's working directory is the "notebooks" folder, the project
# root is its parent; otherwise the working directory itself is used.
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == "notebooks" else current_dir

print(f"Project root: {project_root}")

Project root: /mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-juan/code/Notebooks/repos/azure_ml_pipelines_pa


In [5]:
# Setup environment
# Folder expected to contain the conda specification used when the
# environment has to be registered from scratch.
dependencies_dir = "./env"
os.makedirs(dependencies_dir, exist_ok=True)

custom_env_name = "pa-env"
# Single source of truth for the pinned version that is looked up and reported.
custom_env_version = "5"
try:
    job_env = ml_client.environments.get(name=custom_env_name, version=custom_env_version)
    print(f"Using existing environment: {custom_env_name}:{custom_env_version}")
except Exception as env_lookup_error:
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate, and the reason for the fallback is reported
    # instead of being silently discarded.
    # NOTE(review): any lookup failure (not only "not found") still triggers
    # registration, as before; narrowing this to the SDK's not-found error
    # would be stricter -- confirm the desired behaviour.
    print(
        f"Could not load {custom_env_name}:{custom_env_version} "
        f"({type(env_lookup_error).__name__}: {env_lookup_error}); "
        "registering the environment from the local conda file"
    )
    job_env = Environment(
        name=custom_env_name,
        description="Environment for Personal Agendas pipeline with Neo4j",
        conda_file=os.path.join(dependencies_dir, "conda.yaml"),
        image="mcr.microsoft.com/azureml/openmpi5.0-ubuntu24.04:20250601.v1"
    )
    job_env = ml_client.environments.create_or_update(job_env)
    print(f"Created environment: {custom_env_name}:{job_env.version}")

Using existing environment: pa-env:5


In [6]:
# Step 1: Data Preparation Component
# Command component that runs azureml_step1_data_prep.py from a snapshot of
# the project root. It takes the landing folder plus a config selector and
# declares four folder outputs (registration, scan, session, metadata).
data_preparation_component = command(
    name="data_preparation",
    display_name="Step 1: Data Preparation",
    description="Process registration, scan, and session data",
    code=str(project_root),
    environment=f"{job_env.name}:{job_env.version}",
    environment_variables=dict(KEYVAULT_NAME="strategicai-kv-uks-dev"),
    compute="cpu-cluster",
    # NOTE(review): presumably set so the step is not served from cached
    # results of an earlier run -- confirm.
    is_deterministic=False,
    inputs=dict(
        input_uri=Input(type="uri_folder"),
        config_type=Input(type="string", default="ecomm"),
    ),
    outputs=dict(
        registration_output=Output(type="uri_folder"),
        scan_output=Output(type="uri_folder"),
        session_output=Output(type="uri_folder"),
        metadata_output=Output(type="uri_folder"),
    ),
    # config_type selects which PA/config/config_<type>.yaml file is passed to
    # the script; the ${{...}} placeholders refer to the inputs/outputs
    # declared above.
    command="""python azureml_pipeline/azureml_step1_data_prep.py \\
        --config PA/config/config_${{inputs.config_type}}.yaml \\
        --input_uri ${{inputs.input_uri}} \\
        --output_registration ${{outputs.registration_output}} \\
        --output_scan ${{outputs.scan_output}} \\
        --output_session ${{outputs.session_output}} \\
        --output_metadata ${{outputs.metadata_output}}
    """,
)

In [7]:
# Step 2: Neo4J Preparation Component
# Fail fast when the connection settings are absent. Previously the component
# was built with the placeholders "neo4j+s://your-uri" / "password", so a
# misconfigured notebook still submitted a job that could only fail once it
# reached the cluster.
_missing_neo4j_settings = [
    variable
    for variable, value in (
        ("NEO4J_URI", neo4j_uri),
        ("NEO4J_PASSWORD", neo4j_password),
    )
    if not value
]
if _missing_neo4j_settings:
    raise ValueError(
        "Cannot build the Neo4j step; missing environment variables: "
        + ", ".join(_missing_neo4j_settings)
    )

# Command component that runs azureml_step2_neo4j_prep.py from a snapshot of
# the project root. It consumes the registration, scan and session folders and
# declares a single metadata folder output.
neo4j_preparation_component = command(
    name="neo4j_preparation",
    display_name="Step 2: Neo4J Preparation",
    description="Upload data to Neo4j database",
    inputs={
        "config_type": Input(type="string", default="ecomm"),
        "input_registration": Input(type="uri_folder"),
        "input_scan": Input(type="uri_folder"),
        "input_session": Input(type="uri_folder")
    },
    outputs={
        "metadata_output": Output(type="uri_folder")
    },
    code=str(project_root),
    # config_type selects which PA/config/config_<type>.yaml file is passed to
    # the script; the ${{...}} placeholders refer to the inputs/outputs
    # declared above.
    command="""python azureml_pipeline/azureml_step2_neo4j_prep.py \\
        --config PA/config/config_${{inputs.config_type}}.yaml \\
        --input_registration ${{inputs.input_registration}} \\
        --input_scan ${{inputs.input_scan}} \\
        --input_session ${{inputs.input_session}} \\
        --output_metadata ${{outputs.metadata_output}}
    """,
    environment=f"{job_env.name}:{job_env.version}",
    # NOTE(review): these values become part of the job definition, so the
    # password is likely visible to anyone who can view the job -- consider
    # resolving it from the Key Vault named below inside the step instead.
    environment_variables={
        "KEYVAULT_NAME": "strategicai-kv-uks-dev",
        "NEO4J_URI": neo4j_uri,
        # The username keeps its original fallback value.
        "NEO4J_USERNAME": neo4j_username or "neo4j",
        "NEO4J_PASSWORD": neo4j_password
    },
    compute="cpu-cluster",
    # NOTE(review): presumably set so the upload is not skipped in favour of
    # cached results from an earlier run -- confirm.
    is_deterministic=False
)

In [8]:
# Define the Pipeline
# Two-step pipeline: step 1 prepares the data, step 2 consumes three of
# step 1's folder outputs. Both steps receive the same config selector.
# Parameters:
#   pipeline_input_data  - input forwarded to step 1 as `input_uri`.
#   pipeline_config_type - config selector forwarded to both steps
#                          (defaults to "ecomm").
# Returns a dict exposing step 1's four outputs and step 2's metadata output
# as pipeline-level outputs.
# NOTE(review): the DSL decorator inspects this function; the local names
# `step1` / `step2` presumably become the node names of the submitted job --
# confirm before renaming them.
@pipeline(
    compute="cpu-cluster",
    description="Personal Agendas pipeline with Neo4j"
)
def personal_agendas_pipeline(
    pipeline_input_data: Input,
    pipeline_config_type: str = "ecomm"
):
    # Step 1
    step1 = data_preparation_component(
        input_uri=pipeline_input_data,
        config_type=pipeline_config_type
    )
    
    # Step 2 - uses outputs from Step 1
    # Only the registration, scan and session outputs are wired in; step 1's
    # metadata output is not an input of step 2.
    step2 = neo4j_preparation_component(
        config_type=pipeline_config_type,
        input_registration=step1.outputs.registration_output,
        input_scan=step1.outputs.scan_output,
        input_session=step1.outputs.session_output
    )
    
    # Keys of this dict are the names of the pipeline-level outputs.
    return {
        "registration_data": step1.outputs.registration_output,
        "scan_data": step1.outputs.scan_output,
        "session_data": step1.outputs.session_output,
        "step1_metadata": step1.outputs.metadata_output,
        "neo4j_metadata": step2.outputs.metadata_output
    }

In [9]:
# Configure and submit pipeline
# Fully qualified datastore URI of the landing folder that feeds step 1.
input_data_uri = f"azureml://subscriptions/{subscription_id}/resourcegroups/{resource_group}/workspaces/{workspace_name}/datastores/landing_pa/paths/landing/azureml/"

# Instantiate the pipeline for the "ecomm" configuration.
pipeline_job = personal_agendas_pipeline(
    pipeline_config_type="ecomm",
    pipeline_input_data=Input(path=input_data_uri, type=AssetTypes.URI_FOLDER),
)

# Metadata attached to the job: experiment, display name and tags.
pipeline_job.experiment_name = "personal_agendas_neo4j_experiment"
pipeline_job.display_name = "PA Pipeline with Neo4j - ECOMM"
pipeline_job.tags = dict(
    project="personal_agendas",
    event_type="ecomm",
    includes_neo4j="true",
)

In [10]:
# Submit pipeline
print(f"\n{'=' * 60}")
print("SUBMITTING PIPELINE WITH NEO4J")
print("=" * 60)

# Best-effort submission: a failure is reported as a message rather than
# raised, so the closing banner below is always printed.
try:
    submitted_job = ml_client.jobs.create_or_update(pipeline_job)
    print("\nSUCCESS! Pipeline submitted!")
    print(f"Job Name: {submitted_job.name}")
    print(f"Status: {submitted_job.status}")
    print(f"\nMonitor at: {submitted_job.studio_url}")
except Exception as submit_error:
    print(f"\nSubmission failed: {submit_error!s}")

print(f"\n{'=' * 60}")

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.



SUBMITTING PIPELINE WITH NEO4J


[32mUploading azure_ml_pipelines_pa (1.23 MBs): 100%|██████████| 1234178/1234178 [00:00<00:00, 1811221.99it/s]
[39m

pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored



SUCCESS! Pipeline submitted!
Job Name: nifty_picture_lvjwlr4wn5
Status: NotStarted

Monitor at: https://ml.azure.com/runs/nifty_picture_lvjwlr4wn5?wsid=/subscriptions/b8d6d487-0bd2-4773-b318-12ab763ed178/resourcegroups/strategicai-rg-uks-dev-01/workspaces/strategicai-mlw-uks-dev-01&tid=3540e7dc-31b3-4057-9e31-43e9fe938179

