# Notebook 7: Responsible AI and Testing

This notebook demonstrates:
1. Adversarial testing
2. Hallucination detection and citation accuracy checks
3. Robustness test harness
4. LangSmith trace export
5. Comprehensive evaluation

In [None]:
import sys
sys.path.append('..')

from src.responsible_ai import (
    detect_hallucination,
    check_citation_accuracy,
    RobustnessTestHarness,
    create_test_report,
    evaluate_rag_quality
)
from src.langsmith_integration import (
    initialize_langsmith,
    enable_tracing,
    export_traces,
    print_tracing_status
)
from src.rag_baseline import BaselineRAG
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment so the
# os.getenv() lookups further down (e.g. OPENAI_API_KEY) can find them.
load_dotenv()
print("✓ Imports successful")

## Step 1: Hallucination Detection

In [None]:
# Build the baseline RAG pipeline from the saved FAISS index; the API key is
# read from the environment.
rag = BaselineRAG("../faiss_index", os.getenv("OPENAI_API_KEY"))

# Retrieve sources for one sample query and generate an answer from them.
# `rag`, `answer` and `sources` are reused by later cells.
query = "What are the key principles of GDPR?"
sources = rag.retrieve(query)
answer = rag.generate_answer(query, sources)

# Check the generated answer against the sources it was produced from.
is_hallucination, score, explanation = detect_hallucination(answer, sources)

# Emit the whole report with a single print call, one argument per line.
print(
    f"Query: {query}",
    "\nHallucination Check:",
    f"  Overlap Score: {score:.2%}",
    f"  Is Hallucination: {is_hallucination}",
    f"  Explanation: {explanation}",
    sep="\n",
)

## Step 2: Citation Accuracy

In [None]:
# Check the citations in the generated answer against the retrieved sources.
citation_result = check_citation_accuracy(answer, sources)

# One print call; each argument becomes its own output line.
print(
    "Citation Analysis:",
    f"  Total citations: {citation_result['num_citations']}",
    f"  Valid citations: {citation_result['valid_citations']}",
    f"  Accuracy: {citation_result['accuracy']:.2%}",
    f"  Has citations: {citation_result['has_citations']}",
    sep="\n",
)

## Step 3: Adversarial Testing

In [None]:
# Run the adversarial test suite against the RAG pipeline.
# `harness` is reused by the edge-case and consistency cells below.
harness = RobustnessTestHarness(rag)
adversarial_results = harness.run_adversarial_tests()

# Summary: a result counts as safe only if it carries a truthy 'is_safe' flag;
# results without the flag are treated as unsafe.
total_adversarial = len(adversarial_results)
safe_count = sum(1 for r in adversarial_results if r.get('is_safe', False))
print("\nAdversarial Test Summary:")
print(f"  Total tests: {total_adversarial}")
print(f"  Passed safely: {safe_count}")
# Guard the ratio: an empty result list would otherwise raise ZeroDivisionError.
if total_adversarial:
    print(f"  Pass rate: {safe_count / total_adversarial:.1%}")
else:
    print("  Pass rate: n/a (no tests were run)")

## Step 4: Edge Case Testing

In [None]:
# Run the edge-case test suite using the harness created in the previous cell.
edge_results = harness.run_edge_case_tests()

# Summary: a result counts as handled only if it carries a truthy
# 'handles_gracefully' flag; results without the flag count as failures.
total_edge_cases = len(edge_results)
graceful_count = sum(1 for r in edge_results if r.get('handles_gracefully', False))
print("\nEdge Case Test Summary:")
print(f"  Total tests: {total_edge_cases}")
print(f"  Handled gracefully: {graceful_count}")
# Guard the ratio: an empty result list would otherwise raise ZeroDivisionError.
if total_edge_cases:
    print(f"  Success rate: {graceful_count / total_edge_cases:.1%}")
else:
    print("  Success rate: n/a (no tests were run)")

## Step 5: Consistency Testing

In [None]:
# Run the same query three times through the harness and report how
# consistent the answers were.
consistency_result = harness.run_consistency_tests(
    "What are the key principles of GDPR?",
    num_runs=3,
)

# One print call; each argument becomes its own output line.
print(
    "\nConsistency Test:",
    f"  Query: {consistency_result['query']}",
    f"  Runs: {consistency_result['num_runs']}",
    f"  Unique answers: {consistency_result['unique_answers']}",
    f"  Consistency score: {consistency_result['consistency_score']:.2%}",
    sep="\n",
)

## Step 6: Comprehensive Evaluation

In [None]:
# Queries used for the aggregate quality evaluation.
test_queries = [
    "What are the key principles of GDPR?",
    "What rights do individuals have?",
    "What are the penalties for violations?",
]

# Evaluate the RAG pipeline over all queries and collect aggregate metrics.
evaluation = evaluate_rag_quality(rag, test_queries)

# One print call; each argument becomes its own output line.
print(
    "\nEvaluation Summary:",
    f"  Total queries: {evaluation['total_queries']}",
    f"  Successful: {evaluation['successful_queries']}",
    f"  Avg overlap score: {evaluation['avg_overlap_score']:.2%}",
    f"  Hallucination rate: {evaluation['hallucination_rate']:.2%}",
    f"  Avg citation accuracy: {evaluation['avg_citation_accuracy']:.2%}",
    sep="\n",
)

## Step 7: Complete Test Report

In [None]:
# Build the combined test report for the pipeline.
# NOTE(review): presumably the report is also saved to output_path - confirm
# against create_test_report.
report = create_test_report(rag, output_path="../test_report.json")

# One print call; each argument becomes its own output line.
print(
    "\nTest report generated with:",
    f"  Adversarial tests: {len(report['adversarial_tests'])}",
    f"  Edge case tests: {len(report['edge_case_tests'])}",
    f"  Total tests: {report['summary']['total_tests']}",
    sep="\n",
)

## Step 8: LangSmith Integration

In [None]:
# Report the current LangSmith tracing status.
print_tracing_status()

# LangSmith is optional: a falsy client means it is unavailable, and the
# export cell below then skips the trace export.
client = initialize_langsmith(project_name="gdpr-rag-evaluation")

print(
    "LangSmith client initialized"
    if client
    else "LangSmith not available (set LANGSMITH_API_KEY to enable)"
)

In [None]:
# Export up to 50 traces from the evaluation project, but only when a
# LangSmith client was created in the previous cell.
if not client:
    print("Trace export skipped (no LangSmith client)")
else:
    success = export_traces(
        client,
        "gdpr-rag-evaluation",
        "../traces_export.json",
        limit=50,
    )
    status = '✓ Success' if success else '✗ Failed'
    print(f"Trace export: {status}")

## Summary

✓ Tested hallucination detection
✓ Verified citation accuracy
✓ Ran adversarial tests
✓ Tested edge cases
✓ Evaluated consistency
✓ Generated comprehensive report
✓ Integrated LangSmith tracing

## Next Steps

1. Review the test report at `../test_report.json`
2. Set LANGSMITH_API_KEY for production tracing
3. Add more test cases as needed
4. Deploy with appropriate guardrails
5. Monitor production performance