# Notebook 7: Responsible AI Testing and LangSmith Tracing

This notebook demonstrates:
1. Adversarial testing examples
2. Hallucination detection
3. Robustness evaluation
4. LangSmith trace export
5. Test report generation

In [None]:
# Import required modules
import json
import os

from dotenv import load_dotenv

from src.langsmith_integration import (
    initialize_langsmith,
    log_trace,
    export_traces,
    setup_langsmith_environment
)
from src.rag_baseline import BaselineRAG
from src.responsible_ai import (
    detect_hallucination,
    run_adversarial_tests,
    calculate_robustness_score,
    generate_test_report,
    export_metrics_for_langsmith
)

# Load settings from a .env file (if one is present) into the environment so
# the os.getenv(...) lookups in later cells can find the API keys.
load_dotenv()
print("✓ Imports successful")

## Step 1: Initialize LangSmith

In [None]:
# Set up LangSmith tracing for this notebook's test project.
# The API key is read from the environment; it may be absent.
langsmith_info = initialize_langsmith(
    api_key=os.getenv("LANGSMITH_API_KEY"),
    project_name="gdpr-rag-testing",
)

# Echo the fields of the returned status dict, one per line.
print("LangSmith Status:")
for label, field in (("Enabled", "enabled"), ("Project", "project"), ("Status", "status")):
    print(f"  {label}: {langsmith_info[field]}")

# Tell the reader how to switch tracing on when it is reported as disabled.
if not langsmith_info['enabled']:
    print("\n⚠️  Set LANGSMITH_API_KEY to enable full tracing")

## Step 2: Hallucination Detection

In [None]:
# Test hallucination detection on labelled examples.
# Each case pairs a generated answer with the documents it should be grounded
# in, plus the label ("grounded" / "hallucination") the detector is expected
# to produce for it.
test_cases = [
    {
        "answer": "GDPR Article 15 grants data subjects the right to access their personal data.",
        "docs": [
            {"content": "Article 15: The data subject shall have the right to obtain from the controller confirmation as to whether or not personal data concerning him or her are being processed...", "metadata": {"article": 15}}
        ],
        "expected": "grounded"
    },
    {
        "answer": "GDPR Article 999 states that you can delete all data immediately.",
        "docs": [
            {"content": "Article 17: The data subject shall have the right to obtain from the controller the erasure...", "metadata": {"article": 17}}
        ],
        "expected": "hallucination"
    }
]

print("Hallucination Detection Results:")
print(f"{'='*60}\n")

# One detector result per test case, in order; later cells read this list.
hallucination_results = []
# How many cases the detector labelled the way the test case expects.
num_as_expected = 0

for i, test in enumerate(test_cases, 1):
    result = detect_hallucination(test["answer"], test["docs"])
    hallucination_results.append(result)

    # Compare the detector's verdict with the declared label so a wrong
    # verdict is visible in the output instead of passing silently.
    predicted = "hallucination" if result['likely_hallucination'] else "grounded"
    matches_expected = predicted == test["expected"]
    if matches_expected:
        num_as_expected += 1
    verdict_note = "as expected" if matches_expected else "MISMATCH"

    status = "✗ HALLUCINATION" if result['likely_hallucination'] else "✓ GROUNDED"
    print(f"Test {i}: {status}")
    print(f"  Expected: {test['expected']} ({verdict_note})")
    print(f"  Answer: {test['answer'][:60]}...")
    print(f"  Overlap: {result['overlap_score']:.2%}")
    print(f"  Confidence: {result['confidence']:.2%}")
    print(f"  Reason: {result['reason']}")
    print()

print(f"Detector matched the expected label on {num_as_expected}/{len(test_cases)} test cases")

## Step 3: Run Adversarial Tests

In [None]:
# Build the baseline RAG system that the adversarial suite will be run against.
rag_system = BaselineRAG(
    faiss_path="faiss_index/",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)

# Run the adversarial suite; the returned dict is reused by later cells.
test_results = run_adversarial_tests(rag_system)

# Overall numbers first ...
print(f"\nAdversarial Test Results:")
print(f"{'='*60}")
print(f"Tests Passed: {test_results['num_passed']}/{test_results['num_total']}")
print(f"Pass Rate: {test_results['pass_rate']:.1%}")

# ... then the per-category breakdown.
print(f"\nBy Category:")
for category_name, counts in test_results['by_category'].items():
    # A category with no tests is reported as 0 rather than dividing by zero.
    if counts['total'] > 0:
        category_rate = counts['passed'] / counts['total']
    else:
        category_rate = 0
    print(f"  {category_name}: {counts['passed']}/{counts['total']} ({category_rate:.1%})")

## Step 4: Calculate Robustness Score

In [None]:
# Reduce the adversarial results to a single robustness score.
robustness = calculate_robustness_score(test_results)

print(f"\nRobustness Score: {robustness:.2f}/1.00")

# Score bands, checked from the highest floor down; the first floor the score
# reaches decides the message, and anything below every floor counts as poor.
robustness_bands = (
    (0.8, "✓ Excellent robustness"),
    (0.6, "⚠️  Good robustness, some improvements needed"),
)
robustness_verdict = next(
    (message for floor, message in robustness_bands if robustness >= floor),
    "✗ Poor robustness, significant improvements needed",
)
print(robustness_verdict)

## Step 5: Generate Test Report

In [None]:
# Generate the test report from the adversarial results and show it inline.
report = generate_test_report(test_results)

print(report)

# Save report to file. The output directory is derived from the path so the
# two cannot drift apart if the path is edited.
report_path = "test_reports/adversarial_test_report.txt"
os.makedirs(os.path.dirname(report_path), exist_ok=True)
# Write as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
# Windows) raises UnicodeEncodeError on characters it cannot represent, and the
# report may contain non-ASCII symbols like the "✓" this notebook prints.
with open(report_path, "w", encoding="utf-8") as f:
    f.write(report)

print(f"\n✓ Report saved to: {report_path}")

## Step 6: Log Traces to LangSmith

In [None]:
# Assemble one illustrative trace: the query, a canned answer with its
# source article numbers, and metadata tying it to the adversarial run.
sample_inputs = {"query": "What are data subject rights?"}
sample_outputs = {
    "answer": "Data subjects have several rights...",
    "sources": [15, 17, 20],
}
sample_metadata = {
    "test_type": "adversarial",
    "pass_rate": test_results['pass_rate'],
}

# Log the sample trace.
trace_info = log_trace(
    run_name="adversarial_test_run",
    inputs=sample_inputs,
    outputs=sample_outputs,
    metadata=sample_metadata,
)

# Show what the logging call reported back.
print(f"\nTrace logged:")
for label, field in (("Trace ID", "trace_id"), ("Status", "status")):
    print(f"  {label}: {trace_info[field]}")

## Step 7: Export Metrics to LangSmith

In [None]:
# Combine the adversarial and hallucination results, plus identifying tags for
# this system, into a single metrics payload.
metrics = export_metrics_for_langsmith(
    test_results=test_results,
    hallucination_results=hallucination_results,
    additional_metrics={
        "system": "gdpr-rag",
        "version": "1.0"
    }
)

# Serialize once and reuse the text for both the display and the file.
metrics_json = json.dumps(metrics, indent=2)

print("\nExported Metrics:")
print(f"{'='*60}")
print(metrics_json)

# Save metrics. Create the output directory here instead of relying on the
# report cell having been run first, so this cell also works on its own.
metrics_path = "test_reports/metrics.json"
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, "w", encoding="utf-8") as f:
    f.write(metrics_json)

print(f"\n✓ Metrics saved to: {metrics_path}")

## Step 8: Export LangSmith Traces

In [None]:
# Export the traces of the testing project; the destination file is passed
# as output_path.
export_result = export_traces(
    project_name="gdpr-rag-testing",
    output_path="traces/exported_traces.json",
)

# Print each field of the export summary on its own line.
print(f"\nTrace Export:")
export_fields = (
    ("Success", "success"),
    ("File", "file"),
    ("Count", "count"),
    ("Status", "status"),
)
for label, field in export_fields:
    print(f"  {label}: {export_result[field]}")

## Step 9: Setup Continuous Monitoring

In [None]:
# Configure LangSmith environment settings for the production project, with
# tracing switched on.
env_vars = setup_langsmith_environment(
    api_key=os.getenv("LANGSMITH_API_KEY"),
    project_name="gdpr-rag-production",
    tracing_enabled=True,
)

print("\nLangSmith Environment:")
for name, setting in env_vars.items():
    # Never echo secrets: any entry whose name contains "KEY" is shown only
    # as set ('***') or 'not set'.
    if "KEY" in name:
        shown = '***' if setting else 'not set'
    else:
        shown = setting
    print(f"  {name}: {shown}")

## Step 10: Summary and Recommendations

In [None]:
# Generate final summary from the results gathered in the earlier cells
# (test_results, robustness, hallucination_results, langsmith_info,
# export_result), followed by recommendations.
print(f"\n{'='*60}")
print("RESPONSIBLE AI SUMMARY")
print(f"{'='*60}\n")

print("✓ Adversarial Testing Complete")
print(f"  - Pass Rate: {test_results['pass_rate']:.1%}")
print(f"  - Robustness Score: {robustness:.2f}")

print("\n✓ Hallucination Detection Active")
# Share of checked answers that the detector flagged as likely hallucinations.
# Guarded so an empty result list reports 0% instead of raising
# ZeroDivisionError.
num_checked = len(hallucination_results)
num_flagged = sum(1 for r in hallucination_results if r['likely_hallucination'])
hallucination_rate = num_flagged / num_checked if num_checked else 0.0
print(f"  - Detection Rate: {hallucination_rate:.1%}")

print("\n✓ LangSmith Integration")
print(f"  - Status: {langsmith_info['status']}")
print(f"  - Traces Exported: {export_result['count']}")

print(f"\n{'='*60}")
print("RECOMMENDATIONS:")
print(f"{'='*60}\n")

# Each recommendation block is shown only when its metric misses the target.
if robustness < 0.8:
    print("⚠️  Improve robustness by:")
    print("   - Enhancing input guardrails")
    print("   - Adding more adversarial training data")
    print("   - Implementing stricter output validation")

if hallucination_rate > 0.1:
    print("⚠️  Reduce hallucinations by:")
    print("   - Using more grounded generation")
    print("   - Increasing retrieval quality")
    print("   - Adding citation verification")

if not langsmith_info['enabled']:
    print("⚠️  Enable LangSmith for:")
    print("   - Production monitoring")
    print("   - Debugging and tracing")
    print("   - Compliance auditing")

print("\n✓ All responsible AI checks complete!")

## Summary

In this notebook, we:
- ✓ Initialized LangSmith tracing
- ✓ Performed hallucination detection
- ✓ Ran adversarial test suite
- ✓ Calculated robustness scores
- ✓ Generated test reports
- ✓ Exported metrics and traces
- ✓ Set up continuous monitoring

## Next Steps

1. Review test reports and address failures
2. Enable LangSmith for production monitoring
3. Implement continuous evaluation pipeline
4. Set up alerts for quality degradation
5. Schedule regular security audits and red-teaming exercises