In [1]:
# demo_search_dedup_normalize.py
"""
Demonstration of Search → Deduplication → Normalization Pipeline
Shows the complete workflow from multiple database searches to clean, deduplicated results.
"""

import asyncio
from pathlib import Path
from datetime import datetime
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn

from backend.modules.search.crossref_search import CrossRefSearch
from backend.modules.search.pubmed_search import PubMedSearch
from backend.modules.dedup.deduplicator import Deduplicator
from backend.modules.normalize.normalizer import Normalizer

# Shared rich console: run_demo() prints every message, table and progress
# spinner through this single instance.
console = Console()


async def run_demo():
    """Run the complete Search → Dedup → Normalize demo.

    Searches CrossRef and PubMed for a fixed query, deduplicates the combined
    results, normalizes the unique citations and writes the first few of them
    to a timestamped CSV file inside ``demo_output/``. All progress, tables
    and the final summary are printed through the module-level ``console``.

    Returns:
        dict: Summary with the keys ``total_citations``, ``unique_citations``,
        ``normalized_citations``, ``duplicate_rate`` (a percentage; ``0.0``
        when no citations were retrieved) and ``output_file`` (the CSV path
        as a string).
    """
    import csv

    # Configuration
    query = "machine learning healthcare"
    max_results_per_source = 50
    export_limit = 20  # Only the first few citations are exported for the demo
    output_dir = Path("demo_output")
    output_dir.mkdir(exist_ok=True)

    console.print("\n")
    console.print(Panel.fit(
        "[bold cyan]Systematic Review Pipeline Demo[/bold cyan]\n"
        "[yellow]Search → Deduplication → Normalization[/yellow]",
        border_style="cyan"
    ))

    # ==================== STEP 1: SEARCH ====================
    console.print("\n[bold green]STEP 1: Multi-Database Search[/bold green]")
    console.print(f"Query: [cyan]{query}[/cyan]")
    console.print(f"Max results per source: [cyan]{max_results_per_source}[/cyan]\n")

    all_citations = []

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console
    ) as progress:
        # Search CrossRef
        task1 = progress.add_task("Searching CrossRef...", total=None)
        crossref = CrossRefSearch()
        crossref_results = await crossref.search(query, max_results=max_results_per_source)
        all_citations.extend(crossref_results)
        progress.update(task1, completed=True)
        console.print(f"✓ CrossRef: Found [cyan]{len(crossref_results)}[/cyan] articles")

        # Search PubMed
        task2 = progress.add_task("Searching PubMed...", total=None)
        pubmed = PubMedSearch()
        pubmed_results = await pubmed.search(query, max_results=max_results_per_source)
        all_citations.extend(pubmed_results)
        progress.update(task2, completed=True)
        console.print(f"✓ PubMed: Found [cyan]{len(pubmed_results)}[/cyan] articles")

    total_count = len(all_citations)
    console.print(f"\n[bold]Total citations retrieved: [cyan]{total_count}[/cyan][/bold]")

    # ==================== STEP 2: DEDUPLICATION ====================
    console.print("\n[bold green]STEP 2: Intelligent Deduplication[/bold green]")
    console.print("Using ML-based similarity detection (TF-IDF + Cosine Similarity)\n")

    deduplicator = Deduplicator(similarity_threshold=0.85)

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console
    ) as progress:
        task = progress.add_task("Detecting duplicates...", total=None)
        dedup_result = deduplicator.deduplicate(all_citations)
        progress.update(task, completed=True)

    # Compute the counts once; they feed the stats table, the summary panel
    # and the returned dict.
    unique_count = len(dedup_result.unique_citations)
    removed_count = total_count - unique_count
    # Guard the division: both searches may legitimately return nothing, and
    # an empty result set must not crash the demo with ZeroDivisionError.
    duplicate_rate = removed_count / total_count * 100 if total_count else 0.0

    # Show deduplication stats
    stats_table = Table(title="Deduplication Statistics", show_header=True)
    stats_table.add_column("Metric", style="cyan")
    stats_table.add_column("Value", style="green", justify="right")

    stats_table.add_row("Original Citations", str(total_count))
    stats_table.add_row("Unique Citations", str(unique_count))
    stats_table.add_row("Duplicates Removed", str(removed_count))
    stats_table.add_row("Duplicate Clusters", str(len(dedup_result.duplicate_clusters)))
    stats_table.add_row("Duplication Rate", f"{duplicate_rate:.1f}%")

    console.print(stats_table)

    # Show example clusters
    if dedup_result.duplicate_clusters:
        console.print("\n[bold]Example Duplicate Cluster:[/bold]")
        cluster = dedup_result.duplicate_clusters[0]
        console.print(f"Cluster size: [cyan]{len(cluster.citations)}[/cyan] duplicates")
        console.print(f"Representative: [yellow]{cluster.representative.title[:80]}...[/yellow]")
        console.print(f"Similarity scores: [green]{[f'{s:.2f}' for s in cluster.similarity_scores[:3]]}[/green]")

    # ==================== STEP 3: NORMALIZATION ====================
    console.print("\n[bold green]STEP 3: Data Normalization[/bold green]")
    console.print("Standardizing author names, dates, and metadata\n")

    normalizer = Normalizer()

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console
    ) as progress:
        task = progress.add_task("Normalizing metadata...", total=None)
        normalized_citations = normalizer.normalize_batch(dedup_result.unique_citations)
        progress.update(task, completed=True)

    normalized_count = len(normalized_citations)
    console.print(f"✓ Normalized [cyan]{normalized_count}[/cyan] unique citations")

    # Show normalization examples. The before/after values are fixed
    # illustrative strings; the first citation only decides which rows appear.
    norm_table = Table(title="Normalization Examples", show_header=True)
    norm_table.add_column("Field", style="cyan")
    norm_table.add_column("Before", style="yellow")
    norm_table.add_column("After", style="green")

    if normalized_citations:
        example = normalized_citations[0]
        if example.authors:
            norm_table.add_row("Authors", "Smith, J.; Doe, J.K.", "Smith, John; Doe, Jane K.")
        if example.publication_date:
            norm_table.add_row("Date Format", "2023-01-15", "2023")
        norm_table.add_row("Title Case", "MACHINE LEARNING in HEALTHCARE", "Machine Learning in Healthcare")

    console.print(norm_table)

    # ==================== STEP 4: EXPORT RESULTS ====================
    console.print("\n[bold green]STEP 4: Export Results[/bold green]")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Export to CSV
    csv_file = output_dir / f"demo_results_{timestamp}.csv"
    console.print(f"Exporting to CSV: [cyan]{csv_file}[/cyan]")

    # newline='' lets the csv module control line endings itself.
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title', 'Authors', 'Year', 'Source', 'DOI'])
        for citation in normalized_citations[:export_limit]:
            authors = '; '.join(a.full_name for a in citation.authors) if citation.authors else ''
            year = citation.publication_date.year if citation.publication_date else ''
            writer.writerow([
                citation.title,
                authors,
                year,
                citation.source,
                citation.doi or ''
            ])

    exported_count = min(export_limit, normalized_count)
    console.print(f"✓ Exported [cyan]{exported_count}[/cyan] citations to CSV")

    # ==================== FINAL SUMMARY ====================
    console.print("\n")
    summary_panel = Panel(
        f"[bold green]✓ Demo Complete![/bold green]\n\n"
        f"[cyan]Original Citations:[/cyan] {total_count}\n"
        f"[cyan]After Deduplication:[/cyan] {unique_count}\n"
        f"[cyan]Duplicates Removed:[/cyan] {removed_count}\n"
        f"[cyan]Normalized & Ready:[/cyan] {normalized_count}\n\n"
        f"[yellow]Output saved to:[/yellow] {csv_file}",
        title="Pipeline Summary",
        border_style="green"
    )
    console.print(summary_panel)

    return {
        'total_citations': total_count,
        'unique_citations': unique_count,
        'normalized_citations': normalized_count,
        'duplicate_rate': duplicate_rate,
        'output_file': str(csv_file)
    }


if __name__ == "__main__":
    # Script entry point: build the demo coroutine, then drive it to
    # completion on a fresh event loop.
    demo_coroutine = run_demo()
    asyncio.run(demo_coroutine)

ModuleNotFoundError: No module named 'backend'