# SAGE Pipeline Demonstration

This notebook demonstrates the SAGE (Synthetic Attribute-based Generation with agEnt-based refinement) pipeline for generating privacy-preserving synthetic medical data.

In [None]:
import os
import sys
import json
from pathlib import Path

# Make the project's `src` package importable. The project root is taken to be
# the parent of the current working directory, i.e. this assumes the kernel was
# started one level below the project root -- TODO confirm for your setup.
project_root = Path().absolute().parent
# Guard the append so re-running this cell does not pile up duplicate
# sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import needed modules
from src.privacy.attribute_extractor import AttributeExtractor
from src.privacy.synthetic_generator import SyntheticGenerator
from src.privacy.privacy_agent import PrivacyAgent
from src.privacy.rewriting_agent import RewritingAgent
from src.privacy.sage_pipeline import SAGEPipeline

## 1. Load Sample Medical Data

Let's define a couple of sample medical records inline to demonstrate the SAGE pipeline (in practice these would be loaded from files).

In [None]:
# Sample medical records, hard-coded inline for this demo (normally these would
# be loaded from files). Keys are the record IDs used throughout the notebook;
# values are the free-text clinical notes, kept verbatim (including line breaks).
sample_records = {
    "record1": """Patient: John Smith, 45-year-old male, MRN: 12345678
Date: March 12, 2025
Chief Complaint: Chest pain and shortness of breath for 3 days
History: Patient reports intermittent chest pain, described as pressure-like, radiating to the left arm. 
Pain is worse with exertion and partially relieved with rest. He has a history of hypertension and 
hyperlipidemia, currently taking lisinopril 10mg daily and atorvastatin 20mg daily. Father had MI at age 50.
Physical Exam: BP 145/90, HR 88, RR 18, T 98.6F, O2 97% RA. Heart: Regular rate and rhythm, no murmurs. 
Lungs: Clear to auscultation bilaterally. Extremities: No edema.
Assessment: Acute coronary syndrome, needs urgent evaluation.
Plan: 1) EKG; 2) Troponin levels; 3) Aspirin 325mg; 4) Cardiology consult; 5) Admit to telemetry""",
    
    "record2": """Patient: Sarah Johnson, 32-year-old female, MRN: 87654321
Date: March 10, 2025
Chief Complaint: Severe migraine headache for 2 days
History: Patient has history of migraines since age 16. Current episode began 2 days ago, described as 
throbbing pain in right temporal region, associated with photophobia, phonophobia, and nausea. 
She typically takes sumatriptan but ran out. No fever, no visual changes, no focal weakness.
Medications: Sumatriptan PRN, Ortho Tri-Cyclen
Physical Exam: BP 118/72, HR 76, RR 16, T 98.2F. Neurological exam normal. No meningeal signs.
Assessment: Acute migraine without aura, moderate to severe
Plan: 1) Sumatriptan 100mg PO; 2) IV fluids; 3) Metoclopramide 10mg IV; 4) Refill sumatriptan prescription; 
5) Follow up with neurology if not improved in 48 hours"""
}

# Echo every sample record: an ID header, the note itself, then an
# 80-character dashed rule surrounded by blank lines.
separator = "\n" + "-"*80 + "\n"
for record_id, record_text in sample_records.items():
    print(f"Record ID: {record_id}", record_text, separator, sep="\n")

## 2. Initialize SAGE Pipeline Components

Now let's initialize each component of the SAGE pipeline.

In [None]:
# Build the four stage components with their default configuration.
# They are kept as notebook-level names because later cells call them directly.
attribute_extractor = AttributeExtractor()
synthetic_generator = SyntheticGenerator()
privacy_agent = PrivacyAgent()
rewriting_agent = RewritingAgent()

# Scalar pipeline settings, gathered in one place so they are easy to tweak.
# max_iterations presumably caps the refinement loop -- confirm against SAGEPipeline.
pipeline_settings = {
    "max_iterations": 3,
    "output_dir": "../data/notebook_output",
}

# Wire the components into the end-to-end pipeline.
sage_pipeline = SAGEPipeline(
    attribute_extractor=attribute_extractor,
    synthetic_generator=synthetic_generator,
    privacy_agent=privacy_agent,
    rewriting_agent=rewriting_agent,
    **pipeline_settings,
)

## 3. Stage 1: Attribute-based Data Generation

Let's demonstrate Stage 1 of SAGE, which involves extracting key attributes and generating synthetic data.

In [None]:
# Stage 1 is demonstrated on the first sample record only.
record_id = "record1"
record_text = sample_records[record_id]

# Step 1: pull the key attributes out of the original note.
attributes = attribute_extractor.extract_attributes(record_text)

# Header followed by one "- name: value" line per extracted attribute.
print(
    "Extracted Attributes:",
    *(f"- {attr}: {value}" for attr, value in attributes.items()),
    sep="\n",
)

# Step 2: generate a synthetic note from the attributes alone.
synthetic_data = synthetic_generator.generate(attributes)

print("\nStage 1 Synthetic Data:", synthetic_data, sep="\n")

## 4. Stage 2: Agent-based Privacy Refinement

Now let's demonstrate Stage 2, which involves privacy assessment and iterative refinement.

In [None]:
def _show_assessment(heading, result, with_feedback=False):
    """Print an assessment's safety flag, risk level and detected PII under `heading`.

    When `with_feedback` is true, the assessment's feedback items are listed as well.
    """
    print(heading)
    print(f"Is Safe: {result.is_safe}")
    print(f"Risk Level: {result.risk_level}")
    print(f"PII Detected: {result.pii_detected}")
    if with_feedback:
        print("Feedback:")
        for item in result.feedback:
            print(f"- {item}")


# Compare the Stage 1 synthetic note against the original record.
assessment = privacy_agent.assess(synthetic_data, record_text)
_show_assessment("Privacy Assessment:", assessment, with_feedback=True)

# A single refinement round, run only when the first assessment flags the data.
if not assessment.is_safe:
    refined_data = rewriting_agent.refine(synthetic_data, assessment.feedback)

    print("\nRefined Synthetic Data:")
    print(refined_data)

    # Re-assess the rewritten note against the same original record.
    new_assessment = privacy_agent.assess(refined_data, record_text)
    _show_assessment("\nUpdated Privacy Assessment:", new_assessment)

## 5. Complete SAGE Pipeline

Now let's process both records through the complete SAGE pipeline.

In [None]:
# Run every sample record through the complete pipeline and collect the results.
# `results` is rebuilt from scratch, so re-running this cell never accumulates
# duplicate entries.
results = []

# The loop variables are deliberately NOT called `record_id` / `record_text`:
# those notebook-level names are set by the Stage 1 cell and read by the Stage 2
# cell. Rebinding them here would make a later re-run of Stage 2 compare
# record1's synthetic text against a different original record.
for doc_id, doc_text in sample_records.items():
    print(f"Processing {doc_id}...")
    result = sage_pipeline.process_document(doc_id, doc_text)
    results.append(result)

    # Per-record summary of the pipeline outcome.
    print(f"Is Safe: {result['is_safe']}")
    print(f"Iterations Required: {result['iterations_required']}")
    print("\nFinal Synthetic Data:")
    print(result['final_synthetic_data'])
    print("\n" + "-"*80 + "\n")

## 6. Compare Original vs. Synthetic Data

Let's compare the original and synthetic data to see how SAGE preserves medical utility while protecting privacy.

In [None]:
import pandas as pd
from IPython.display import display, HTML

# Build a long-format table with two rows per record: the original note followed
# by its final synthetic counterpart.
comparison_data = []

# `results` was filled in the same order as `sample_records`, so entries are
# paired by position. The loop variables avoid the names `record_id` /
# `record_text`, which the Stage 1 / Stage 2 cells rely on; rebinding them here
# would silently change what a re-run of those cells operates on.
for position, (doc_id, doc_text) in enumerate(sample_records.items()):
    comparison_data.append({
        "Type": "Original",
        "Record": doc_id,
        "Text": doc_text
    })

    comparison_data.append({
        "Type": "Synthetic",
        "Record": doc_id,
        "Text": results[position]['final_synthetic_data']
    })

df = pd.DataFrame(comparison_data)

# Create styled HTML table: left-aligned text with line breaks preserved,
# grey background for original rows and light blue for synthetic rows.
styled_df = df.style.set_properties(**{'text-align': 'left', 'white-space': 'pre-wrap'})
styled_df = styled_df.set_properties(subset=pd.IndexSlice[df['Type'] == 'Original', :], **{'background-color': '#f2f2f2'})
styled_df = styled_df.set_properties(subset=pd.IndexSlice[df['Type'] == 'Synthetic', :], **{'background-color': '#e6f7ff'})

display(HTML(styled_df.to_html()))

## 7. Analyzing Privacy Protection

Let's analyze how well SAGE protects different types of private information.

In [None]:
import re

# Regex heuristics used to spot potentially identifying strings in a note.
# NOTE: these are rough approximations. The "Names" pattern matches any two
# consecutive capitalised words, so section headings such as "Chief Complaint"
# are counted as names too.
privacy_categories = {
    "Names": r'\b[A-Z][a-z]+ [A-Z][a-z]+\b',
    "Dates": r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b',
    "Patient IDs": r'\bMRN: \d+\b',
    "Ages": r'\b\d{1,2}-year-old\b',
    "Exact Measurements": r'\b\d{2,3}/\d{2,3}\b'  # Blood pressure, etc.
}

analysis_results = []

# The loop variables avoid the names `record_id` / `record_text`, which the
# Stage 1 / Stage 2 cells rely on; rebinding them here would silently change
# what a re-run of those cells operates on.
for doc_id, original_text in sample_records.items():
    # Final synthetic text produced by the pipeline for this record.
    synthetic_text = next(r['final_synthetic_data'] for r in results if r['document_id'] == doc_id)

    for category, pattern in privacy_categories.items():
        original_matches = re.findall(pattern, original_text)
        synthetic_matches = re.findall(pattern, synthetic_text)

        # Distinct original strings, and the ones that reappear verbatim in the
        # synthetic text.
        unique_originals = set(original_matches)
        direct_matches = unique_originals & set(synthetic_matches)

        # The rate is computed over *distinct* original strings so numerator and
        # denominator use the same unit. Dividing by the raw match count would
        # understate leakage whenever an identifier occurs more than once in the
        # original (e.g. a name repeated twice and leaked would score 50%).
        if unique_originals:
            protection_rate = (1 - len(direct_matches) / len(unique_originals)) * 100
        else:
            # Nothing to protect in this category.
            protection_rate = 100

        analysis_results.append({
            "Record": doc_id,
            "Category": category,
            "Original Count": len(original_matches),
            "Synthetic Count": len(synthetic_matches),
            "Direct Matches": len(direct_matches),
            "Protection Rate": protection_rate
        })

analysis_df = pd.DataFrame(analysis_results)
display(analysis_df)

# Calculate overall protection rate (unweighted mean over every record/category
# row, including categories that had no original matches and score 100).
overall_protection = analysis_df['Protection Rate'].mean()
print(f"Overall Privacy Protection Rate: {overall_protection:.2f}%")

## 8. Conclusion

In this notebook, we've demonstrated the SAGE pipeline for generating privacy-preserving synthetic medical data. The two-stage approach effectively:

1. Preserves the essential medical information through attribute-based extraction and generation
2. Protects private information through agent-based privacy assessment and refinement

The resulting synthetic data maintains clinical utility while significantly reducing privacy risks, making it suitable for use in a Retrieval-Augmented Generation (RAG) system for biomedical question answering.