# Personal Agendas Complete Pipeline with 4 Steps
This notebook builds the complete Personal Agendas pipeline and submits it to Azure ML. The pipeline consists of:
- Step 1: Data Preparation (Registration, Scan, Session)
- Step 2: Neo4j Preparation (Visitors, Sessions, Relationships)
- Step 3: Session Embedding
- Step 4: Recommendations


In [None]:
# Import required libraries
import os
import sys
import json
from pathlib import Path
from datetime import datetime

# Azure ML imports
from azure.ai.ml import MLClient, command, dsl, Input, Output
from azure.ai.ml.entities import Environment, AssetTypes
from azure.ai.ml.dsl import pipeline
from azure.identity import DefaultAzureCredential
from azure.ai.ml.constants import InputOutputModes
from dotenv import load_dotenv

# Load environment variables (python-dotenv) before any os.getenv() lookup
# in the cells below.
load_dotenv()

# sys.version is the Python interpreter version, not the Azure ML SDK version,
# so look the SDK version up from the installed package metadata instead and
# report the interpreter separately.
from importlib import metadata

try:
    azure_ml_sdk_version = metadata.version("azure-ai-ml")
except metadata.PackageNotFoundError:
    # Keep the banner informative even if the distribution metadata is missing.
    azure_ml_sdk_version = "unknown"

print(f"Azure ML SDK Version: {azure_ml_sdk_version}")
print(f"Python Version: {sys.version}")
print(f"Current time: {datetime.now().isoformat()}")

In [None]:
# Configure Azure ML workspace
# Each coordinate comes from the environment when set; otherwise the
# hard-coded dev value is used.
subscription_id = os.getenv("SUBSCRIPTION_ID", "b8d6d487-0bd2-4773-b318-12ab763ed178")
resource_group = os.getenv("RESOURCE_GROUP", "strategicai-rg-uks-dev-01")
workspace_name = os.getenv("AZUREML_WORKSPACE_NAME", "strategicai-mlw-uks-dev-01")

print(
    f"Subscription ID: {subscription_id}",
    f"Resource Group: {resource_group}",
    f"Workspace: {workspace_name}",
    sep="\n",
)

# Create ML client
ml_client = MLClient(
    workspace_name=workspace_name,
    resource_group_name=resource_group,
    subscription_id=subscription_id,
    credential=DefaultAzureCredential(),
)

print("\nML Client created successfully")

# Fetching the workspace is the first real service call made with the client.
workspace_location = ml_client.workspaces.get(workspace_name).location
print(f"Workspace location: {workspace_location}")

In [None]:
# Setup Neo4j environment variables
# URI and username have defaults; the password has none and may be absent.
neo4j_uri = os.getenv("NEO4J_URI", "neo4j+s://1e87c76e.databases.neo4j.io")
neo4j_username = os.getenv("NEO4J_USERNAME", "neo4j")
neo4j_password = os.getenv("NEO4J_PASSWORD")

if neo4j_password:
    print(
        "✓ Neo4j credentials configured",
        f"Neo4j URI: {neo4j_uri}",
        f"Neo4j Username: {neo4j_username}",
        sep="\n",
    )
else:
    # A missing (or empty) password is only reported here, not treated as fatal.
    print(
        "WARNING: NEO4J_PASSWORD not set in environment",
        "The pipeline will try to get it from Azure Key Vault",
        sep="\n",
    )

In [None]:
# Verify project structure
# The notebook can be started either from <project>/notebooks or from the
# project root itself; resolve the root accordingly.
current_dir = Path.cwd()
if current_dir.name == "notebooks":
    project_root = current_dir.parent
    print("Running from notebooks folder")
else:
    project_root = current_dir
    print("Running from project root")

print(f"Project root: {project_root}")

# Check required directories
pa_dir = project_root / "PA"
if pa_dir.exists():
    print("✓ PA directory found")
else:
    print(f"ERROR: PA directory not found at {pa_dir}")

pipeline_dir = project_root / "azureml_pipeline"
if pipeline_dir.exists():
    print("✓ azureml_pipeline directory found")
else:
    # Only the directory is created; the step scripts checked below will
    # still be reported as missing.
    print("Creating azureml_pipeline directory")
    pipeline_dir.mkdir(exist_ok=True)

# Check for all step scripts
step_scripts = {
    "Step 1": pipeline_dir / "azureml_step1_data_prep.py",
    "Step 2": pipeline_dir / "azureml_step2_neo4j_prep.py",
    "Step 3": pipeline_dir / "azureml_step3_session_embedding.py",
    "Step 4": pipeline_dir / "azureml_step4_recommendations.py",
}

for step_name, script_path in step_scripts.items():
    if script_path.exists():
        print(f"✓ {step_name} script found")
    else:
        print(f"WARNING: {step_name} script not found at {script_path}")

# Check config files
config_vet = pa_dir / "config" / "config_vet.yaml"
config_ecomm = pa_dir / "config" / "config_ecomm.yaml"

# Report missing configs too, so a bad checkout is visible here rather than
# only when a pipeline step fails to open its config file.
for config_path in (config_ecomm, config_vet):
    if config_path.exists():
        print(f"✓ {config_path.name} found")
    else:
        print(f"WARNING: {config_path.name} not found at {config_path}")

In [None]:
# Environment setup
dependencies_dir = project_root / "environment"
custom_env_dir = project_root / "env"

# Check which environment directory exists; the custom "env" folder takes
# precedence over "environment".
conda_file = None
for candidate in (custom_env_dir / "conda.yaml", dependencies_dir / "conda.yaml"):
    if candidate.exists():
        conda_file = candidate
        print(f"Using conda.yaml from: {conda_file}")
        break
else:
    print("WARNING: No conda.yaml found, will use existing environment")

# Try to use existing environment first
env_name = "pa-env"
env_version = None

# Environment used whenever the custom one cannot be resolved or created.
fallback_env_name = "AzureML-sklearn-1.0-ubuntu20.04-py38-cpu"
fallback_env_version = "1"

try:
    environments = list(ml_client.environments.list(name=env_name))
    # Only purely numeric versions can be ordered to find the latest one.
    numeric_versions = [
        int(env.version) for env in environments if str(env.version).isdigit()
    ]
    if numeric_versions:
        # Use the latest version
        env_version = max(numeric_versions)
        print(f"Using existing environment: {env_name}:{env_version}")
    else:
        if environments:
            print(f"No numeric version found for environment: {env_name}")
        else:
            print(f"No existing environment found with name: {env_name}")
        if conda_file:
            print("Creating new environment...")
            job_env = Environment(
                name=env_name,
                description="Personal Agendas Pipeline Environment",
                conda_file=str(conda_file),
                image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
            )
            created_env = ml_client.environments.create_or_update(job_env)
            # Use the version reported by the service instead of assuming "1".
            env_version = created_env.version or "1"
            print(f"Created environment: {env_name}:{env_version}")
except Exception as env_error:
    # Best-effort: any failure while resolving the environment falls through
    # to the default below instead of stopping the notebook.
    print(f"Error with environment: {env_error}")
    env_version = None

if env_version is None:
    # Fallback to default. This also covers "no existing environment and no
    # conda.yaml", which would otherwise leave the later cells building the
    # invalid reference "pa-env:None".
    env_name = fallback_env_name
    env_version = fallback_env_version
    print(f"Using default environment: {env_name}:{env_version}")

In [None]:
# Step 1: Data Preparation Component
# Inputs: the raw data folder, plus the config flavour that selects
# PA/config/config_<config_type>.yaml in the command line below.
step1_inputs = {
    "input_uri": Input(type="uri_folder"),
    "config_type": Input(type="string", default="ecomm"),
}

# One folder output per --output_* argument passed to the step script.
step1_outputs = {
    "registration_output": Output(type="uri_folder"),
    "scan_output": Output(type="uri_folder"),
    "session_output": Output(type="uri_folder"),
    "metadata_output": Output(type="uri_folder"),
}

# Each trailing backslash joins the next line into the same command line;
# the ${{...}} placeholders are left in the string for Azure ML to substitute.
step1_command = """python azureml_pipeline/azureml_step1_data_prep.py \
        --config PA/config/config_${{inputs.config_type}}.yaml \
        --input_uri ${{inputs.input_uri}} \
        --output_registration ${{outputs.registration_output}} \
        --output_scan ${{outputs.scan_output}} \
        --output_session ${{outputs.session_output}} \
        --output_metadata ${{outputs.metadata_output}}
    """

data_preparation_component = command(
    name="data_preparation",
    display_name="Step 1: Data Preparation",
    description="Process registration, scan, and session data",
    inputs=step1_inputs,
    outputs=step1_outputs,
    # The whole project root is uploaded as the code folder, so the command
    # can use project-relative paths.
    code=str(project_root),
    command=step1_command,
    environment=f"{env_name}:{env_version}",
    compute="cpu-cluster",
    is_deterministic=False,
)

print("✓ Step 1 component defined")

In [None]:
# Step 2: Neo4j Preparation Component
# Inputs: the config flavour plus the four folders passed to the script below.
step2_inputs = {
    "config_type": Input(type="string", default="ecomm"),
    "registration_data": Input(type="uri_folder"),
    "scan_data": Input(type="uri_folder"),
    "session_data": Input(type="uri_folder"),
    "step1_metadata": Input(type="uri_folder"),
}

step2_outputs = {
    "metadata_output": Output(type="uri_folder"),
}

# Connection settings handed to the job as environment variables. A missing
# password is forwarded as an empty string.
# NOTE(review): the password travels in the job definition as plain text —
# confirm this is acceptable, or rely on the Key Vault lookup only.
step2_environment_variables = {
    "KEYVAULT_NAME": "strategicai-kv-uks-dev",
    "NEO4J_URI": neo4j_uri,
    "NEO4J_USERNAME": neo4j_username,
    "NEO4J_PASSWORD": neo4j_password or "",
}

# Each trailing backslash joins the next line into the same command line.
step2_command = """python azureml_pipeline/azureml_step2_neo4j_prep.py \
        --config PA/config/config_${{inputs.config_type}}.yaml \
        --registration_data ${{inputs.registration_data}} \
        --scan_data ${{inputs.scan_data}} \
        --session_data ${{inputs.session_data}} \
        --step1_metadata ${{inputs.step1_metadata}} \
        --output_metadata ${{outputs.metadata_output}}
    """

neo4j_preparation_component = command(
    name="neo4j_preparation",
    display_name="Step 2: Neo4j Preparation",
    description="Load data into Neo4j and create relationships",
    inputs=step2_inputs,
    outputs=step2_outputs,
    code=str(project_root),
    command=step2_command,
    environment=f"{env_name}:{env_version}",
    environment_variables=step2_environment_variables,
    compute="cpu-cluster",
    is_deterministic=False,
)

print("✓ Step 2 component defined")

In [None]:
# Step 3: Session Embedding Component
# "neo4j_ready" is never referenced in the command line below.
# NOTE(review): presumably it exists only so the pipeline can order this step
# after Step 2 — confirm.
step3_inputs = {
    "config_type": Input(type="string", default="ecomm"),
    "neo4j_ready": Input(type="uri_folder", optional=True),
}

step3_outputs = {
    "metadata_output": Output(type="uri_folder"),
}

# Connection settings handed to the job as environment variables. A missing
# password is forwarded as an empty string.
step3_environment_variables = {
    "KEYVAULT_NAME": "strategicai-kv-uks-dev",
    "NEO4J_URI": neo4j_uri,
    "NEO4J_USERNAME": neo4j_username,
    "NEO4J_PASSWORD": neo4j_password or "",
}

# Each trailing backslash joins the next line into the same command line.
step3_command = """python azureml_pipeline/azureml_step3_session_embedding.py \
        --config PA/config/config_${{inputs.config_type}}.yaml \
        --output_metadata ${{outputs.metadata_output}}
    """

session_embedding_component = command(
    name="session_embedding",
    display_name="Step 3: Session Embedding",
    description="Generate and store text embeddings for session nodes in Neo4j",
    inputs=step3_inputs,
    outputs=step3_outputs,
    code=str(project_root),
    command=step3_command,
    environment=f"{env_name}:{env_version}",
    environment_variables=step3_environment_variables,
    compute="cpu-cluster",
    is_deterministic=False,
)

print("✓ Step 3 component defined")

In [None]:
# Step 4: Recommendations Component
# MLflow/Databricks settings forwarded to the job. No other cell in this
# notebook defines these names, so read them from the environment here, the
# same way OPENAI_API_KEY is read; unset values become empty strings.
databricks_token = os.getenv("DATABRICKS_TOKEN", "")
databricks_host = os.getenv("DATABRICKS_HOST", "")
mlflow_tracking_uri = os.getenv("MLFLOW_TRACKING_URI", "")
mlflow_registry_uri = os.getenv("MLFLOW_REGISTRY_URI", "")
mlflow_experiment_id = os.getenv("MLFLOW_EXPERIMENT_ID", "")

recommendations_component = command(
    name="recommendations",
    display_name="Step 4: Recommendations",
    description="Generate session recommendations for visitors",
    inputs={
        "config_type": Input(type="string", default="ecomm"),
        # Not referenced in the command line below.
        # NOTE(review): presumably only orders this step after Step 3 — confirm.
        "embeddings_ready": Input(type="uri_folder", optional=True),
    },
    outputs={
        "metadata_output": Output(type="uri_folder")
    },
    code=str(project_root),
    # Each trailing backslash joins the next line into the same command line.
    command="""python azureml_pipeline/azureml_step4_recommendations.py \
        --config PA/config/config_${{inputs.config_type}}.yaml \
        --output_metadata ${{outputs.metadata_output}}
    """,
    environment=f"{env_name}:{env_version}",
    # NOTE(review): secrets below travel in the job definition as plain
    # text — confirm this is acceptable.
    environment_variables={
        "KEYVAULT_NAME": "strategicai-kv-uks-dev",
        "NEO4J_URI": neo4j_uri,
        "NEO4J_USERNAME": neo4j_username,
        "NEO4J_PASSWORD": neo4j_password or "",
        "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", ""),
        # MLflow/Databricks environment variables
        "DATABRICKS_TOKEN": databricks_token or "",
        "DATABRICKS_HOST": databricks_host or "",
        "MLFLOW_TRACKING_URI": mlflow_tracking_uri or "",
        "MLFLOW_REGISTRY_URI": mlflow_registry_uri or "",
        "MLFLOW_EXPERIMENT_ID": mlflow_experiment_id or ""
    },
    compute="cpu-cluster",
    is_deterministic=False
)

print("✓ Step 4 component defined")

In [None]:
# Define the complete pipeline
@pipeline(
    description="Complete Personal Agendas pipeline with 4 steps",
    compute="cpu-cluster",
)
def personal_agendas_complete_pipeline(
    pipeline_input_data: Input,
    pipeline_config_type: str = "ecomm"
):
    """Wire the four step components into one sequential pipeline.

    Args:
        pipeline_input_data: Input handed to Step 1 as ``input_uri``.
        pipeline_config_type: Config flavour passed unchanged to every step.

    Returns:
        A dict mapping pipeline output names to the step outputs: the four
        Step 1 outputs plus the metadata output of Steps 2, 3 and 4.
    """
    # Step 1: Data Preparation
    data_prep = data_preparation_component(
        input_uri=pipeline_input_data,
        config_type=pipeline_config_type,
    )
    data_prep.name = "step1_data_preparation"
    prep_outputs = data_prep.outputs

    # Step 2: Neo4j Preparation, fed by every Step 1 output
    neo4j_prep = neo4j_preparation_component(
        config_type=pipeline_config_type,
        registration_data=prep_outputs.registration_output,
        scan_data=prep_outputs.scan_output,
        session_data=prep_outputs.session_output,
        step1_metadata=prep_outputs.metadata_output,
    )
    neo4j_prep.name = "step2_neo4j_preparation"

    # Step 3: Session Embedding, bound to Step 2's metadata output
    embedding = session_embedding_component(
        config_type=pipeline_config_type,
        neo4j_ready=neo4j_prep.outputs.metadata_output,
    )
    embedding.name = "step3_session_embedding"

    # Step 4: Recommendations, bound to Step 3's metadata output
    recommender = recommendations_component(
        config_type=pipeline_config_type,
        embeddings_ready=embedding.outputs.metadata_output,
    )
    recommender.name = "step4_recommendations"

    # Return all outputs
    pipeline_outputs = {
        "registration_data": prep_outputs.registration_output,
        "scan_data": prep_outputs.scan_output,
        "session_data": prep_outputs.session_output,
        "step1_metadata": prep_outputs.metadata_output,
    }
    pipeline_outputs["neo4j_metadata"] = neo4j_prep.outputs.metadata_output
    pipeline_outputs["embedding_metadata"] = embedding.outputs.metadata_output
    pipeline_outputs["recommendations_metadata"] = recommender.outputs.metadata_output
    return pipeline_outputs

print("✓ Complete pipeline defined")

In [None]:
# Configure input data URI
# Assembled segment by segment: the workspace coordinates come from the
# earlier cell; the datastore name and path are fixed.
input_data_uri = (
    f"azureml://subscriptions/{subscription_id}"
    f"/resourcegroups/{resource_group}"
    f"/workspaces/{workspace_name}"
    "/datastores/landing_pa/paths/landing/azureml/"
)

print(f"Input data URI: {input_data_uri}")

In [None]:
# Create pipeline instance
# Single switch for the event configuration. The pipeline argument, the
# display name and the "event_type" tag are all derived from it, so changing
# this one value cannot leave the job mislabelled.
config_type = "ecomm"  # or "vet"

pipeline_job = personal_agendas_complete_pipeline(
    pipeline_input_data=Input(
        type=AssetTypes.URI_FOLDER,
        path=input_data_uri
    ),
    pipeline_config_type=config_type
)

# Configure pipeline metadata
pipeline_job.display_name = f"Personal Agendas Complete Pipeline - {config_type.upper()}"
pipeline_job.tags = {
    "project": "personal_agendas",
    "event_type": config_type,
    "environment": "dev",
    "includes_neo4j": "true",
    "includes_embeddings": "true",
    "includes_recommendations": "true",
    "incremental": "false",
    "step_count": "4",
    "complete_pipeline": "true"
}
pipeline_job.experiment_name = "personal_agendas_complete_experiment"

print("✓ Pipeline instance created")

In [None]:
# Submit the pipeline
print("\nSubmitting pipeline...")
print("=" * 60)

try:
    # Hand the job to Azure ML; the returned object describes the submitted run.
    pipeline_run = ml_client.jobs.create_or_update(pipeline_job)

    print("✓ Pipeline submitted successfully!")
    print("\nPipeline Details:")
    print(f"  Name: {pipeline_run.name}")
    print(f"  Display Name: {pipeline_run.display_name}")
    print(f"  Status: {pipeline_run.status}")
    print(f"  Experiment: {pipeline_run.experiment_name}")
    print("\n🔗 View pipeline in Azure ML Studio:")
    print(f"  {pipeline_run.studio_url}")

except Exception as submit_error:
    # Report the failure, then re-raise so the cell still fails visibly.
    print(f"❌ Error submitting pipeline: {submit_error!s}")
    raise

In [None]:
# Optional: Monitor pipeline progress
import time

print("\nMonitoring pipeline progress...")
print("(Press Ctrl+C to stop monitoring)\n")

try:
    while True:
        # Re-fetch the job on every pass to read its current status.
        job = ml_client.jobs.get(pipeline_run.name)
        status = job.status
        timestamp = datetime.now().strftime('%H:%M:%S')

        print(f"[{timestamp}] Status: {status}", end="")

        if status not in ("Completed", "Failed", "Canceled"):
            # Still running: the carriage return makes the next poll
            # overwrite this line. Check again in 30 seconds.
            print(" (waiting...)", end="\r")
            time.sleep(30)
            continue

        # One of the three end states was reached: report it and stop polling.
        print(f"\n\nPipeline {status}!")
        if status == "Failed":
            print("Check the logs in Azure ML Studio for error details.")
        break

except KeyboardInterrupt:
    # Ctrl+C only stops this polling loop; nothing here cancels the job.
    print("\n\nMonitoring stopped. Pipeline continues running in Azure ML.")
    print(f"Check status at: {pipeline_run.studio_url}")

## Pipeline Summary

This notebook submits a complete 4-step Personal Agendas pipeline:

1. **Data Preparation**: Processes registration, scan, and session data
2. **Neo4j Preparation**: Loads data into Neo4j and creates relationships
3. **Session Embedding**: Generates embeddings for sessions
4. **Recommendations**: Generates personalized session recommendations

### Key Features:
- Fully generic and configurable (works with both `ecomm` and `vet` configurations)
- Proper data flow between steps
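- Neo4j integration: connection settings are read from environment variables, and the pipeline falls back to Azure Key Vault when `NEO4J_PASSWORD` is not set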
- Incremental processing support (can be enabled per step)
- Complete error handling and logging

### Next Steps:
1. Monitor the pipeline execution in Azure ML Studio
2. Check the output metadata for each step
3. Verify recommendations are generated in Neo4j
4. Download the recommendations output file for analysis