# Milestone 7: Responsible AI Testing and Evaluation

This notebook demonstrates responsible AI practices:
1. Hallucination detection
2. Adversarial testing
3. Robustness testing
4. Quality evaluation
5. Responsible AI monitoring
6. LangSmith trace integration

## Setup

In [None]:
# Import required modules
import os
import sys

from dotenv import load_dotenv

# Make the parent directory importable so the local `src` package resolves.
sys.path.append('..')

from src import guardrails, langsmith_integration, rag_baseline, responsible_ai

# Read environment settings via python-dotenv before anything queries them.
load_dotenv()

print("Responsible AI modules loaded successfully!")
# Only the presence of the key is reported; its value is never echoed.
print(f"LangSmith API Key present: {bool(os.environ.get('LANGSMITH_API_KEY'))}")

## Step 1: Hallucination Detection

Test detection of hallucinated content in answers.

In [None]:
# Test hallucination detection
# Each case pairs an answer with the sources it should be grounded in.
# "expected" is the human-readable label; "expect_hallucination" is the same
# expectation as a boolean so it can be compared with the detector's verdict.
test_cases = [
    {
        "answer": "GDPR regulates data protection in the European Union.",
        "sources": [{"content": "GDPR is a regulation on data protection and privacy in the EU"}],
        "expected": "low hallucination",
        "expect_hallucination": False,
    },
    {
        "answer": "GDPR requires companies to delete all data every 30 days.",
        "sources": [{"content": "GDPR regulates data protection and privacy"}],
        "expected": "high hallucination",
        "expect_hallucination": True,
    },
    {
        "answer": "Personal data includes names, email addresses, and IP addresses.",
        "sources": [{"content": "Personal data means any information relating to an identified person including name, email, location data"}],
        "expected": "low hallucination",
        "expect_hallucination": False,
    },
]

# Maximum number of answer characters echoed per test case.
ANSWER_PREVIEW_CHARS = 60

print("Hallucination Detection Results:")
print("=" * 60)

for i, case in enumerate(test_cases, 1):
    answer_text = case['answer']
    # Add an ellipsis only when the answer was actually cut short.
    if len(answer_text) <= ANSWER_PREVIEW_CHARS:
        answer_preview = answer_text
    else:
        answer_preview = f"{answer_text[:ANSWER_PREVIEW_CHARS]}..."

    print(f"\nTest {i}: {case['expected']}")
    print(f"Answer: {answer_preview}")

    result = responsible_ai.detect_hallucination(
        answer_text,
        case['sources']
    )

    print(f"Score: {result['score']:.2f}")
    print(f"Is hallucination: {result['is_hallucination']}")
    print(f"Message: {result['message']}")

    # Surface disagreements between the detector and the stated expectation
    # instead of leaving the reader to compare them by eye.
    matches_expectation = bool(result['is_hallucination']) == case['expect_hallucination']
    print(f"Matches expectation: {'yes' if matches_expectation else 'NO'}")

## Step 2: Adversarial Testing

Generate and test adversarial examples.

In [None]:
# Generate adversarial examples
adversarial_prompts = responsible_ai.create_adversarial_examples()

print(f"Generated {len(adversarial_prompts)} adversarial prompts:")
print("=" * 60)

for i, prompt in enumerate(adversarial_prompts, 1):
    # Show at most 80 characters; the ellipsis marks real truncation only, so
    # short prompts are no longer displayed as if text had been cut off.
    prompt_preview = prompt if len(prompt) <= 80 else f"{prompt[:80]}..."
    print(f"\n{i}. {prompt_preview}")

print("\nThese prompts should be blocked by guardrails or handled safely")

## Step 3: Robustness Testing

Run comprehensive robustness test suite.

In [None]:
# Build the baseline RAG system that the robustness suite will exercise.
rag_system = rag_baseline.BaselineRAG()

print("Running robustness test suite...\n")

# Execute the suite; the returned mapping carries counts plus per-test details.
test_results = responsible_ai.run_robustness_tests(rag_system)

print("\n" + "=" * 60)
print("ROBUSTNESS TEST SUMMARY:")
print(f"  Total tests: {test_results['total']}")
print(f"  Passed: {test_results['passed']} ✅")
print(f"  Failed: {test_results['failed']} ❌")
print(f"  Errors: {test_results['errors']} ⚠️")

# Only failed tests are itemised; errored ones appear in the count above.
if test_results['failed'] > 0:
    print("\nFailed tests:")
    failed_names = (
        entry['test']
        for entry in test_results['details']
        if entry['status'] == 'failed'
    )
    for failed_name in failed_names:
        print(f"  - {failed_name}")

## Step 4: Answer Quality Evaluation

Evaluate quality of generated answers.

In [None]:
# Answer-quality fixtures: the first has a reference ground truth, the second
# passes None for it.
test_answers = [
    {
        "answer": "Personal data refers to any information about an identified person, including names and email addresses.",
        "sources": [{"content": "Personal data means information relating to an identified person"}],
        "ground_truth": "Personal data is information about an identified or identifiable person."
    },
    {
        "answer": "GDPR has principles.",
        "sources": [{"content": "GDPR establishes principles for data processing"}],
        "ground_truth": None
    }
]

print("Answer Quality Evaluation:")
print("=" * 60)

for position, sample in enumerate(test_answers, start=1):
    print(f"\nAnswer {position}:")
    print(f"Text: {sample['answer']}")

    # Score this answer against its sources and, when given, the ground truth.
    metrics = responsible_ai.evaluate_answer_quality(
        sample['answer'], sample['sources'], sample['ground_truth']
    )

    print("\nMetrics:")
    for metric_name, metric_value in metrics.items():
        print(f"  {metric_name}: {metric_value}")

## Step 5: Responsible AI Monitor

Use the monitoring system to track metrics.

In [None]:
# Instantiate the monitor that accumulates per-query records.
monitor = responsible_ai.ResponsibleAIMonitor()

print("Monitoring RAG queries...\n")

# Hand-written (query, answer, sources) triples standing in for live traffic.
test_queries = [
    (
        "What is GDPR?",
        "GDPR is a data protection regulation.",
        [{"content": "GDPR regulates data protection"}],
    ),
    (
        "What are data subject rights?",
        "Data subjects have various rights.",
        [{"content": "GDPR grants rights to data subjects"}],
    ),
]

for query, answer, sources in test_queries:
    print(f"Logging: {query}")
    monitor.log_query(query, answer, sources, hallucination_check=True)
    print()

# Print whatever the monitor has aggregated so far, one entry per line.
print("\nMonitoring Report:")
print("=" * 60)
report = monitor.get_report()
for report_key, report_value in report.items():
    print(f"{report_key}: {report_value}")

## Step 6: LangSmith Integration

Initialize and use LangSmith for tracing.

In [None]:
# Configure LangSmith for this notebook's project and keep the returned settings.
config = langsmith_integration.initialize_langsmith(project_name="gdpr-rag-tests")

print("LangSmith Configuration:")
print("=" * 60)
for setting, setting_value in config.items():
    print(f"{setting}: {setting_value}")

# Tell the reader whether tracing is active and, if not, how to switch it on.
if not config['enabled']:
    print("\n⚠️ LangSmith tracing disabled (no API key)")
    print("Set LANGSMITH_API_KEY to enable tracing")
else:
    print("\n✅ LangSmith tracing is enabled")
    print("All RAG queries will be traced automatically")

## Step 7: Create LangSmith Tracer

Use the tracer helper for manual tracing.

In [None]:
# Build a tracer bound to the notebook's LangSmith project.
tracer = langsmith_integration.LangSmithTracer(project_name="gdpr-rag-tests")

print("\nTracing example queries...")

# A single hand-written exchange to record as a trace.
query = "What is the right to erasure?"
answer = "The right to erasure allows individuals to request deletion of their data."
sources = [{"content": "Article 17 describes the right to erasure", "article": 17}]

run_id = tracer.trace_query(
    query=query,
    answer=answer,
    sources=sources,
    metadata={"model": "gpt-3.5-turbo", "temperature": 0.7},
)

# A falsy run id is reported as a dry run; otherwise show where to view the trace.
if not run_id:
    print("\nTrace simulation complete (dry-run mode)")
else:
    print(f"\nTrace recorded: {run_id}")
    url = langsmith_integration.get_trace_url(run_id, "gdpr-rag-tests")
    print(f"View at: {url}")

## Step 8: Export Traces

Export traces for analysis.

In [None]:
# Write the project's traces to a JSON file one directory above the notebook.
print("Exporting traces...\n")

success = tracer.export_project_traces(output_file="../traces_export.json")

# A falsy result is reported as a skipped export rather than an error.
print(
    "✅ Traces exported successfully"
    if success
    else "⚠️ Export skipped (requires API key)"
)

## Step 9: Trace Statistics

Get statistics from LangSmith.

In [None]:
# Fetch the statistics mapping for the notebook's LangSmith project.
stats = langsmith_integration.get_trace_statistics(project_name="gdpr-rag-tests")

print("Trace Statistics:")
print("=" * 60)
# One "name: value" line per statistic, in the order the mapping yields them.
for stat_name, stat_value in stats.items():
    print(f"{stat_name}: {stat_value}")

## Step 10: End-to-End Test with All Features

Combine all responsible AI features in one test.

In [None]:
# End-to-end responsible AI test
# `guardrails` is imported alongside the other project modules in the setup
# cell, so this cell no longer carries its own import.

print("End-to-End Responsible AI Test")
print("=" * 60)

# Initialize components
# NOTE: `monitor` and `tracer` rebind the names created in Steps 5 and 7, so
# re-running those earlier cells (or the export cell) after this one will use
# whichever object was bound last. This tracer is built without a project name.
rag = rag_baseline.BaselineRAG()
safety = guardrails.SafetyFilter()
monitor = responsible_ai.ResponsibleAIMonitor()
tracer = langsmith_integration.LangSmithTracer()

def responsible_rag_query(query):
    """Run one query through the full responsible-AI pipeline, printing progress.

    The stages are: input filtering, RAG answer generation, hallucination
    scoring, output filtering, monitor logging and LangSmith tracing. The
    function relies on the notebook-level ``safety``, ``rag``, ``monitor`` and
    ``tracer`` objects and on the ``responsible_ai`` module.

    Args:
        query: The raw user question.

    Returns:
        The object returned by ``rag.query`` with its ``'answer'`` entry
        replaced by the output-filtered text, or ``None`` when the input
        filter rejects the query.
    """
    print("\n1. Input validation and filtering...")
    is_safe, filtered_query, msg = safety.filter_input(query)

    # Rejected input stops here: nothing is generated, logged or traced.
    if not is_safe:
        print(f"   ❌ Blocked: {msg}")
        return None

    print("   ✅ Input safe")

    print("\n2. Running RAG pipeline...")
    result = rag.query(filtered_query)
    print("   ✅ Answer generated")

    # The hallucination score is computed on the answer as generated, before
    # output filtering; it is reported but does not gate the response.
    print("\n3. Hallucination detection...")
    hall_result = responsible_ai.detect_hallucination(
        result['answer'],
        result['sources']
    )
    print(f"   Score: {hall_result['score']:.2f}")
    print(f"   Status: {hall_result['message']}")

    print("\n4. Output filtering...")
    result['answer'] = safety.filter_output(result['answer'])
    print("   ✅ Output filtered")

    # Record the filtered query, i.e. the text the pipeline actually ran, so
    # the logged question matches the logged answer and the unfiltered user
    # input is not persisted. Presumably filter_input sanitizes the text;
    # confirm against guardrails.SafetyFilter.
    print("\n5. Logging to monitor...")
    monitor.log_query(filtered_query, result['answer'], result['sources'])
    print("   ✅ Logged")

    print("\n6. Tracing with LangSmith...")
    tracer.trace_query(filtered_query, result['answer'], result['sources'])
    print("   ✅ Traced")

    return result

# Test
test_query = "What are the principles of data processing?"
print(f"\nQuery: {test_query}")
print("=" * 60)

result = responsible_rag_query(test_query)

# `result` is None when the input filter blocked the query; nothing to show then.
if result:
    final_answer = result['answer']
    # Show at most 150 characters and add the ellipsis only on real truncation,
    # so a short answer is not displayed as if it had been cut off.
    if len(final_answer) <= 150:
        answer_preview = final_answer
    else:
        answer_preview = f"{final_answer[:150]}..."

    print("\n" + "=" * 60)
    print("FINAL RESULT:")
    print(f"Answer: {answer_preview}")
    print(f"Sources: {result['num_sources']}")

## Summary

In this notebook, we:
- ✅ Implemented hallucination detection
- ✅ Created adversarial test examples
- ✅ Ran comprehensive robustness tests
- ✅ Evaluated answer quality
- ✅ Implemented monitoring system
- ✅ Integrated LangSmith tracing
- ✅ Exported and analyzed traces
- ✅ Built end-to-end responsible AI pipeline

## Next Steps

For production deployment:
1. Set up API keys (OPENAI_API_KEY, LANGSMITH_API_KEY)
2. Run all notebooks with real data
3. Expand test coverage
4. Monitor production metrics
5. Iterate on safety filters based on real usage