# ServiceNow Incident Assignment Agent - Fuzzy Evaluation

This notebook evaluates an AI agent using fuzzy matching for assignment group comparisons.
It includes both exact match and fuzzy match accuracy metrics.

ToDos for MVP:

0. Pull Agent registration out of evaluate
1. Agent Deployment
2. Monitoring: enable the inference table and run drift checks.

In [0]:
# Setup and Install Required Packages
%pip install -U -qqqq mlflow>=3.0.0 langgraph==0.3.4 databricks-langchain databricks-agents uv rapidfuzz backoff

In [0]:
# Restart the Python process so that the package versions installed by the
# previous cell are the ones picked up by the imports that follow.
# NOTE(review): presumably clears all Python state defined above this cell,
# which is why every import lives below it - confirm.
dbutils.library.restartPython()

In [0]:
# Register the classification prompt template(s) described in the YAML file.
from src.prompts.prompt_manager import PromptManager

# Catalog / schema handed to the prompt manager.
catalog = "marcin_demo"
schema = "demo_schema_v2"

prompt_manager = PromptManager(
    catalog=catalog,
    schema=schema,
)

# Path is relative to the notebook's working directory.
prompt_template_yaml = "../src/prompts/templates/classification_instruction.yaml"
registered = prompt_manager.register_from_yaml(prompt_template_yaml)
print(f"Registered prompts: {registered}")

In [0]:
# Import required libraries, make the project's `src` package importable, and
# point MLflow at the experiment and registry used by the rest of the notebook.
import sys
import os
import warnings
import mlflow

# Imported explicitly because a later cell builds the model's resource list
# with it; previously it was only reachable through a wildcard import.
from mlflow.models.resources import DatabricksServingEndpoint

# Suppress warnings for the whole notebook session.
warnings.filterwarnings('ignore')

# Work out which directory must be on sys.path for `import src...` to resolve:
# when the working directory is `notebooks/`, the project root is one level up;
# otherwise the working directory itself is used.
working_dir = os.getcwd()
if os.path.basename(working_dir) == 'notebooks':
    parent_dir = os.path.dirname(working_dir)
else:
    parent_dir = working_dir

# Prepend (rather than append) so the project's modules win over any
# same-named installed package.
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

print(f"Working directory: {os.getcwd()}")
print(f"Parent directory added to path: {parent_dir}")

# Explicit imports instead of `import *`, so it is clear where each name used
# by the cells below comes from.
from src.evaluation.mlflow_evaluation import create_evaluation_dataset, evaluate_agent

# Configure MLflow 3 - use a consistent experiment name without a timestamp so
# repeated executions land in the same experiment.
experiment_name = "/Users/marcin.jimenez@databricks.com/triage_recommendation"
mlflow.set_experiment(experiment_name)
mlflow.set_registry_uri('databricks-uc')

# TODO: production monitoring / tracing setup is not enabled in this notebook yet.

print("✅ Libraries imported with MLflow 3 GenAI capabilities")
print(f"📊 MLflow Experiment: {experiment_name}")

## Fuzzy Matching Examples

Let's demonstrate how fuzzy matching works with assignment group names:

In [0]:
from src.evaluation.metrics.fuzzy_evaluator import fuzzy_evaluator

# (expected, predicted) assignment-group pairs covering common near-miss cases.
test_pairs = [
    ("Apps Inpatient Core", "Apps Inpatient Core"),        # Exact match
    ("Apps Inpatient Core", "Apps Inpatient-Core"),        # Minor punctuation difference
    ("Apps Inpatient Core", "apps inpatient core"),        # Case difference
    ("Apps Emergency Department", "Apps Emergency Dept"),  # Abbreviation
    ("Apps Laboratory", "Apps Lab"),                       # Short form
    ("Epic Security", "Epic-Security"),                    # Hyphen difference
    ("Apps Rev Cycle HIM", "Apps RevCycle HIM"),           # Spacing difference
]


def _simulated_model_output(group_name):
    """Return a payload shaped like the agent's JSON answer, recommending `group_name`."""
    return {
        "incident_number": "INC123",
        "short_description": "Sample description",
        "recommended_assignment_group": group_name,
        "reason": "Example reason",
        "confidence": "90%"
    }


# Table header.
print("Fuzzy Matching Pass/Fail:")
print("=" * 70)
print(f"{'Expected':<30} {'Predicted':<30} {'Pass?':<8} {'Rationale'}")
print("-" * 70)

# One table row per pair: the evaluator receives the simulated prediction as
# `outputs` and the ground-truth group as `inputs`.
for expected, predicted in test_pairs:
    feedback = fuzzy_evaluator(
        outputs=_simulated_model_output(predicted),
        inputs={"assignment_group": expected},
    )
    print(f"{expected:<30} {predicted:<30} {str(feedback.value):<8} {feedback.rationale}")

## Data Preview and Agent Testing

In [0]:
%sql
-- Preview five incidents together with the group each one is assigned to.
SELECT
  number AS incident_number,
  assignment_group
FROM prod_silver.dts_ops.servicehub_task_displayvalue
LIMIT 5

In [0]:
%sql
-- Look up the assignment group recorded for one specific incident.
SELECT
  number AS incident_number,
  assignment_group
FROM prod_silver.dts_ops.servicehub_task_displayvalue
WHERE number = 'INC4463480'

In [0]:
# Smoke-test the agent end to end on a single incident number.
import json
from src.agents.triage_agent import AGENT

LLM_ENDPOINT_NAME = "databricks-meta-llama-3-3-70b-instruct"

mlflow.models.set_model(AGENT)

# The request carries the message list under "input" (the chat-style
# {"messages": [...]} payload is not used here).
single_incident_request = {
    "input": [
        {"role": "user", "content": "INC4463480"}
    ]
}
response = AGENT.predict(single_incident_request)
print("Agent Response (agent type: %s, response type: %s)" % (type(AGENT), type(response)))

# The text of the last output item is read once and reused below.
response_text = response.output[-1].content[0]['text']
print(response_text)

# The text is parsed as JSON; the recommended group is one of its fields.
assignment_group = json.loads(response_text)["recommended_assignment_group"]
print(f"\nExtracted Assignment Group: {assignment_group}")

## Create Evaluation Dataset

In [0]:
# Build the evaluation dataset by sampling incidents from the source table.
from src.evaluation.mlflow_evaluation import create_evaluation_dataset

eval_source_table = "prod_silver.dts_ops.servicehub_task_displayvalue"
eval_sample_size = 100

eval_dataset = create_evaluation_dataset(
    spark=spark,
    table_path=eval_source_table,
    sample_size=eval_sample_size,
)

print(f"\nCreated evaluation dataset with {len(eval_dataset)} incidents")

In [0]:
# Preview the first five records of the evaluation dataset (rendered as the cell output).
eval_dataset[:5]

In [0]:
# Show the distinct values found under 'targets'.
# NOTE(review): presumably these are the ground-truth assignment groups - confirm
# against create_evaluation_dataset.
display(eval_dataset['targets'].unique())

## Register Agent into UC

In [0]:
# Log the triage agent as an MLflow pyfunc model. `python_model` is given as a
# path to the agent's source file, and `model_config` carries the settings
# below alongside it.
# Imported here so this cell does not rely on a wildcard import elsewhere.
from mlflow.models.resources import DatabricksServingEndpoint

agent_path = "../src/agents/triage_agent.py"
agent_config = {
    "endpoint_name": "databricks-meta-llama-3-1-8b-instruct",
    "temperature": 0.1,
    "max_tokens": 500,
    "prompt_template": "classification_instruction"
}

# Declare the serving endpoint as a resource of the model only when one is
# configured; otherwise no resources are declared.
endpoint_name = agent_config.get("endpoint_name")
resources = [DatabricksServingEndpoint(endpoint_name=endpoint_name)] if endpoint_name else []

input_example = {
    "input": [
        {"role": "user", "content": "INC0936934"}
    ]
}

# langgraph is pinned to the same version the notebook installs and tests
# with, so the logged model's environment does not drift to a different
# release.
requirements = [
    "mlflow>=3.0.0",
    "databricks-sdk[openai]",
    "databricks-agents",
    "databricks-langchain",
    "langgraph==0.3.4",
    "backoff"
]

model_info = mlflow.pyfunc.log_model(
    name="dts_ops_triage_agent-v1",
    python_model=agent_path,
    model_config=agent_config,
    resources=resources,
    input_example=input_example,
    pip_requirements=requirements,
)

## Run Agent Evaluation with Fuzzy Matching

In [0]:
# Evaluate the logged agent against the evaluation dataset and print a summary.
from src.evaluation.mlflow_evaluation import evaluate_agent
import warnings

section_rule = "="*50

print("Starting evaluation with MLflow 3 GenAI features...")
print(section_rule)

# Warnings raised while the evaluation runs are silenced; the previous warning
# filters are restored when the `with` block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Note carried over from the original author: the fuzzy metric is not
    # passed via extra_metrics because of MLflow compatibility issues and is
    # expected to be calculated and logged separately.
    # Only the three arguments below are passed; the optional judge / metric
    # settings are left unset.
    model_info, eval_results = evaluate_agent(
        model_info=model_info,
        eval_dataset=eval_dataset,
        experiment_name=experiment_name,
    )

# Summary: a short report when results came back, a warning otherwise.
if not eval_results:
    print("⚠️ No evaluation results available")
else:
    print("\n" + section_rule)
    print("MLFLOW 3 EVALUATION COMPLETE")
    print(section_rule)
    print(f"\n✅ Evaluated {len(eval_dataset)} incidents")
    print("✅ Full tracing captured for all LLM interactions")
    print("✅ Results logged to MLflow with comprehensive metrics")

    # Tell the reader where to find the run in the MLflow UI.
    print(f"\n🔗 View results in MLflow UI:")
    print(f"   Experiment: {experiment_name}")