# Tool 3 - Quality Validator (Hybrid Pattern)

**Status:** ✅ Ready for Databricks | **LLM Cost:** $0 (deterministic) or ~$0.002 (hybrid) | **Performance:** <1s (deterministic) or ~10s (hybrid)

**Pattern:** Deterministic coverage checks + optional LLM enhancement

**Showcase:** Hybrid validation - fast deterministic metrics + smart LLM risk assessment (configurable).

**Key Features:**
- Hybrid async function with configurable LLM fallback (`use_llm_enhancement=True/False`)
- Deterministic coverage metrics: description, owner, source (always runs, <1s)
- Optional LLM enhancement: risk level (LOW/MEDIUM/HIGH/CRITICAL), text quality, P0/P1/P2 recommendations
- Anomaly detection: orphan entities, missing owners, suspicious patterns
- Two modes: `hybrid` (default) or `deterministic_only`

**TODO:**
- [ ] Add lineage coverage metric (upstream/downstream sources)
- [ ] Validate recommendation priority distribution (not all P0)
- [ ] Test with low-quality metadata (0% coverage scenarios)
- [ ] Add trend analysis (quality over time)

**IDEA:**
- Cache LLM enhancement results for unchanged structures (avoid re-analysis)
- Add custom quality rules engine (user-defined thresholds)
- Export recommendations as actionable Jira/GitHub issues

In [None]:
# Install dependencies
%pip install pydantic-ai>=0.0.49 pydantic>=2.8.0

In [None]:
# Restart Python kernel to use new packages
# NOTE(review): `dbutils` is not imported anywhere in this notebook; it is
# presumably injected by the Databricks runtime, which is why the call is
# marked with `type: ignore`.
dbutils.library.restartPython()  # type: ignore

In [None]:
import asyncio
import json
import os
import time
from datetime import datetime

from pydantic import BaseModel, Field
from pydantic_ai import Agent  # type: ignore

In [None]:
# Configure Azure OpenAI from Databricks secrets.
# All three values are read from the "mcop" secret scope.
AZURE_ENDPOINT = dbutils.secrets.get(scope="mcop", key="azure-openai-endpoint")  # type: ignore
AZURE_API_KEY = dbutils.secrets.get(scope="mcop", key="azure-openai-api-key")  # type: ignore
DEPLOYMENT_NAME = dbutils.secrets.get(scope="mcop", key="azure-openai-deployment-name")  # type: ignore

# Set environment variables for Pydantic AI
os.environ["OPENAI_BASE_URL"] = AZURE_ENDPOINT
os.environ["OPENAI_API_KEY"] = AZURE_API_KEY

# Model identifier in "<provider>:<model>" form, using the deployment name as the model.
MODEL_NAME = f"openai:{DEPLOYMENT_NAME}"
# Only the model name is echoed; the endpoint and API key are never printed.
print(f"✅ Configured model: {MODEL_NAME}")

In [None]:
# Pydantic schemas
class Recommendation(BaseModel):
    """Quality improvement recommendation."""

    # All three fields are required plain strings; the vocabulary named in
    # each description is guidance only and is not enforced by the type.
    priority: str = Field(
        ...,
        description="P0 (critical), P1 (high), P2 (medium)",
    )
    category: str = Field(
        ...,
        description="Category: coverage, consistency, naming, security",
    )
    message: str = Field(
        ...,
        description="Actionable recommendation text",
    )

class AnomalyNote(BaseModel):
    """Data quality anomaly."""

    # Every field is a required free-form string; the listed anomaly types
    # and severities are descriptive hints, not validated enumerations.
    entity: str = Field(
        ...,
        description="Entity or table name with anomaly",
    )
    anomaly_type: str = Field(
        ...,
        description="Type: orphan_entity, missing_owner, suspicious_pattern",
    )
    severity: str = Field(
        ...,
        description="Severity: high, medium, low",
    )
    details: str = Field(
        ...,
        description="Anomaly details",
    )

class LLMEnhancement(BaseModel):
    """LLM-enhanced quality assessment."""

    # Required scalar assessments. The 0-1 range of the score is stated in
    # the description only; no numeric bound is enforced here.
    risk_level: str = Field(
        ...,
        description="Overall risk: LOW, MEDIUM, HIGH, CRITICAL",
    )
    text_quality_score: float = Field(
        ...,
        description="Text quality 0-1",
    )

    # Recommendations must be supplied; anomalies default to a fresh empty list.
    recommendations: list[Recommendation] = Field(
        ...,
        description="P0/P1/P2 recommendations",
    )
    anomalies: list[AnomalyNote] = Field(
        default_factory=list,
        description="Detected anomalies",
    )

    summary: str = Field(
        ...,
        description="Executive summary",
    )

class CoverageMetrics(BaseModel):
    """Deterministic coverage metrics."""

    # Three coverage ratios, each described as a fraction of the entities.
    description_coverage: float = Field(
        ...,
        description="Fraction of entities with descriptions",
    )
    owner_coverage: float = Field(
        ...,
        description="Fraction of entities with owners",
    )
    source_coverage: float = Field(
        ...,
        description="Fraction of entities with sources",
    )

    # Denominator used for the ratios, and the time the metrics were produced.
    total_entities: int = Field(
        ...,
        description="Total entities evaluated",
    )
    timestamp: str = Field(
        ...,
        description="ISO 8601 timestamp",
    )

class QualityReport(BaseModel):
    """Complete quality validation report."""

    # Coverage metrics are mandatory on every report.
    coverage: CoverageMetrics = Field(...)
    # The LLM assessment is optional and defaults to None.
    llm_enhancement: LLMEnhancement | None = Field(default=None)
    # Elapsed time for the run, in seconds.
    execution_time_seconds: float = Field(...)
    mode: str = Field(
        ...,
        description="hybrid or deterministic_only",
    )

In [None]:
# Create LLM enhancement agent.
# The agent is bound to MODEL_NAME and is asked to return its answer as an
# LLMEnhancement instance (structured result rather than free text).
# NOTE(review): later pydantic-ai releases reportedly rename `result_type` to
# `output_type`; the install cell sets no upper version bound, so confirm this
# keyword against the version that actually gets installed.
enhancement_agent = Agent(
    MODEL_NAME,
    result_type=LLMEnhancement,
    system_prompt="""You are a metadata quality expert.

Analyze the provided structure and assess:
1. Risk level (LOW/MEDIUM/HIGH/CRITICAL) based on coverage gaps
2. Text quality score (0-1) for descriptions and documentation
3. Prioritized recommendations (P0=critical, P1=high, P2=medium)
4. Data quality anomalies (orphan entities, missing owners, suspicious patterns)
5. Executive summary for stakeholders

Be specific and actionable."""
)

print("✅ Enhancement agent created")

In [None]:
def _coverage_fraction(entities: list, field: str) -> float:
    """Return the share of ``entities`` whose ``field`` value is truthy (0.0 if empty)."""
    if not entities:
        return 0.0
    return sum(1 for entity in entities if entity.get(field)) / len(entities)


async def validate_quality(structure: dict, use_llm_enhancement: bool = True) -> QualityReport:
    """Hybrid quality validation: deterministic coverage + optional LLM enhancement.

    The deterministic step always runs: it concatenates the ``facts`` and
    ``dimensions`` lists of ``structure`` and measures what fraction of those
    entities carry a truthy ``description``, ``owner`` and ``source``. When
    ``use_llm_enhancement`` is true, the coverage figures and the full
    structure (as JSON) are then sent to ``enhancement_agent``.

    Args:
        structure: Tool 2 structural classification output. Missing or null
            ``facts`` / ``dimensions`` entries are treated as empty lists.
        use_llm_enhancement: If True, runs LLM analysis (~10s). If False, only coverage (<1s).

    Returns:
        QualityReport with coverage metrics, the optional LLM assessment
        (``None`` in deterministic mode), the elapsed time in seconds and the
        mode label (``"hybrid"`` or ``"deterministic_only"``).
    """
    # Monotonic clock: elapsed time must not be affected by wall-clock adjustments.
    started = time.perf_counter()

    # Step 1: Deterministic coverage (always runs, <1s)
    # `or []` also covers keys that are present but null, which `.get(key, [])` does not.
    all_entities = (structure.get("facts") or []) + (structure.get("dimensions") or [])

    coverage = CoverageMetrics(
        description_coverage=_coverage_fraction(all_entities, "description"),
        owner_coverage=_coverage_fraction(all_entities, "owner"),
        source_coverage=_coverage_fraction(all_entities, "source"),
        total_entities=len(all_entities),
        # Timezone-aware so the ISO 8601 string carries an explicit UTC offset.
        timestamp=datetime.now().astimezone().isoformat(),
    )

    # Step 2: Optional LLM enhancement (~10s)
    llm_result = None
    if use_llm_enhancement:
        prompt = f"""Analyze this metadata structure:

Coverage Metrics:
- Description coverage: {coverage.description_coverage:.1%}
- Owner coverage: {coverage.owner_coverage:.1%}
- Source coverage: {coverage.source_coverage:.1%}

Structure:
{json.dumps(structure, indent=2)}

Provide risk assessment, quality score, recommendations (P0/P1/P2), anomalies, and summary."""

        run_result = await enhancement_agent.run(prompt)
        # NOTE(review): later pydantic-ai releases reportedly expose the parsed
        # payload as `.output` and deprecate `.data`; prefer `.output` when it
        # exists and fall back to `.data` otherwise. Confirm against the
        # installed version.
        if hasattr(run_result, "output"):
            llm_result = run_result.output
        else:
            llm_result = run_result.data

    execution_time = time.perf_counter() - started
    mode = "hybrid" if use_llm_enhancement else "deterministic_only"

    return QualityReport(
        coverage=coverage,
        llm_enhancement=llm_result,
        execution_time_seconds=execution_time,
        mode=mode,
    )

print("✅ Hybrid validation function defined")

In [None]:
# Load Tool 2 structure from DBFS
structure_path = "/dbfs/FileStore/mcop/tool2/structure.json"

# Decode explicitly as UTF-8 instead of depending on the platform's default
# text encoding.
with open(structure_path, "r", encoding="utf-8") as f:
    structure = json.load(f)

print(f"✅ Loaded structure from: {structure_path}")
# `or []` keeps the counts working when a key is present but null.
print(f"   Facts: {len(structure.get('facts') or [])}")
print(f"   Dimensions: {len(structure.get('dimensions') or [])}")

In [None]:
# Run hybrid validation (deterministic + LLM)
report = await validate_quality(structure, use_llm_enhancement=True)

print(f"\n‚úÖ Validation complete ({report.mode} mode)")
print(f"   Execution time: {report.execution_time_seconds:.2f}s")
print(f"\nüìä Coverage Metrics:")
print(f"   Description: {report.coverage.description_coverage:.1%}")
print(f"   Owner: {report.coverage.owner_coverage:.1%}")
print(f"   Source: {report.coverage.source_coverage:.1%}")

if report.llm_enhancement:
    print(f"\nü§ñ LLM Enhancement:")
    print(f"   Risk Level: {report.llm_enhancement.risk_level}")
    print(f"   Text Quality: {report.llm_enhancement.text_quality_score:.2f}")
    print(f"   Recommendations: {len(report.llm_enhancement.recommendations)}")
    print(f"   Anomalies: {len(report.llm_enhancement.anomalies)}")

In [None]:
# Save quality report to DBFS
output_path = "/dbfs/FileStore/mcop/tool3/quality_report.json"
# Create the target directory on first run; a no-op when it already exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write explicitly as UTF-8 instead of depending on the platform's default
# text encoding.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(report.model_dump(), f, indent=2)

print(f"✅ Quality report saved: {output_path}")

In [None]:
# Display detailed report
print("\n" + "=" * 80)
print("QUALITY VALIDATION REPORT")
print("=" * 80)

print("\n📈 Coverage Summary:")
print(f"   Total entities: {report.coverage.total_entities}")
print(f"   Description coverage: {report.coverage.description_coverage:.1%}")
print(f"   Owner coverage: {report.coverage.owner_coverage:.1%}")
print(f"   Source coverage: {report.coverage.source_coverage:.1%}")

if report.llm_enhancement:
    enhancement = report.llm_enhancement

    print("\n🔍 Risk Assessment:")
    print(f"   Risk Level: {enhancement.risk_level}")
    print(f"   Text Quality: {enhancement.text_quality_score:.2f}")

    # Rank before truncating so the "top" entries really are the most urgent
    # ones rather than whatever the LLM happened to list first. sorted() is
    # stable, so ties keep their original order; unrecognised labels sort last.
    priority_rank = {"P0": 0, "P1": 1, "P2": 2}
    top_recommendations = sorted(
        enhancement.recommendations,
        key=lambda rec: priority_rank.get(rec.priority.strip().upper()[:2], len(priority_rank)),
    )[:5]

    print(f"\n💡 Recommendations ({len(enhancement.recommendations)}):")
    for rec in top_recommendations:  # Top 5 by priority
        print(f"   [{rec.priority}] {rec.category}: {rec.message}")

    severity_rank = {"HIGH": 0, "MEDIUM": 1, "LOW": 2}
    top_anomalies = sorted(
        enhancement.anomalies,
        key=lambda note: severity_rank.get(note.severity.strip().upper(), len(severity_rank)),
    )[:3]

    print(f"\n⚠️  Anomalies ({len(enhancement.anomalies)}):")
    for anomaly in top_anomalies:  # Top 3 by severity
        print(f"   [{anomaly.severity.upper()}] {anomaly.anomaly_type}: {anomaly.details}")

    print("\n📝 Executive Summary:")
    print(f"   {enhancement.summary}")

print(f"\n⏱️  Execution: {report.execution_time_seconds:.2f}s ({report.mode} mode)")
print("=" * 80)