In [3]:
import sys
sys.path.insert(0, '..')

from core.web_discovery import WebDiscovery
from pathlib import Path
from dotenv import load_dotenv

# Read settings from a local .env file (if present) into the process environment.
load_dotenv()

# Discovery client reused by the later cells of this notebook.
discovery = WebDiscovery()

# Folder that holds the collected source documents. It is created on first
# run, together with any missing parent directories.
sources_dir = Path('../data/sources')
sources_dir.mkdir(exist_ok=True, parents=True)

print("‚úì Web Discovery initialized")


✓ Web Discovery initialized


## Step 1: Choose Your Approach

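Select how to guide the source discovery process: provide a research topic manually, let AI extract one from your existing documents, or reuse the topics and concepts of an existing knowledge graph.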

In [None]:
# ===================================================================
# CHOOSE YOUR APPROACH: keep exactly ONE of the three options below
# uncommented (Option 3 is active by default). If more than one is left
# uncommented, the assignments that come last take effect.
# ===================================================================

# Option 1: Manual research topic
# The topic string is used as given; document loading and graph loading are skipped.
# research_topic = "EU Data Act and Linked Data governance frameworks"
# use_graph_mode = False

# Option 2: Auto-extract from existing documents
# The topic is extracted by AI from the first three documents in the sources directory.
# research_topic = None
# use_graph_mode = False

# Option 3: Use knowledge graph (recommended if you have a refined graph) ⭐
# If the graph file is missing or fails to load, the cell falls back to the
# document extraction of Option 2.
research_topic = None
use_graph_mode = True
graph_path = "../data/graphs/knowledge_graph.ttl"  # Path to your knowledge graph (parsed as Turtle)

# ===================================================================

# ===================================================================

from pathlib import Path
from markitdown import MarkItDown
from core.rag_engine import VaultRAG

# Initialize MarkItDown for DOCX conversion
md_converter = MarkItDown()

# Step 1: Convert any DOCX files to Markdown
# Every .docx in the sources directory is written out as a same-named .md
# file. The original is deleted only after that write succeeded; any error
# is reported and the loop moves on to the next file.
docx_files = list(sources_dir.glob("*.docx"))
if docx_files:
    print(f"üìÑ Converting {len(docx_files)} DOCX file(s)...\n")
    for source_file in docx_files:
        try:
            converted = md_converter.convert(str(source_file))
            target_file = source_file.with_suffix('.md')
            with target_file.open('w', encoding='utf-8') as out:
                out.write(converted.text_content)
            source_file.unlink()
            print(f"  ‚úì Converted: {target_file.name}")
        except Exception as e:
            print(f"  ‚úó Error: {e}")
    print()

# Step 2: Load documents and/or knowledge graph
# In graph mode the Turtle file at graph_path is parsed into the RAG engine's
# RDF graph and its topics/concepts are pulled out for later query generation.
# A missing file or any error along the way switches use_graph_mode off so the
# document-extraction section can take over.
if use_graph_mode:
    print("üîç Graph Mode: Loading knowledge graph...\n")

    graph_file = Path(graph_path)
    if graph_file.exists():
        # Engine instance that will hold the parsed graph
        rag = VaultRAG(sources_dir=str(sources_dir), verbose=False)

        try:
            # Parse the TTL file into the engine's RDF graph
            rag.rdf_graph.parse(str(graph_file), format='turtle')
            print(f"  ‚úì Loaded knowledge graph from {graph_file.name}")

            # Report what the graph contains
            stats = rag.get_graph_stats()
            print(f"  üìä Graph contains:")
            print(f"      - {stats['domain_concepts']} domain concepts")
            print(f"      - {stats['topic_nodes']} topic nodes")
            print(f"      - {stats['documents']} documents")
            print(f"      - {stats['chunks']} chunks")
            print(f"      - {stats['total_triples']} total triples\n")

            # Pull the top topics and concepts out of the graph
            print("üß† Extracting knowledge from graph...\n")
            topics = rag.get_graph_topics(top_k=5)
            concepts = rag.get_graph_concepts(top_k=20)

            if topics:
                print(f"  ‚úì Found {len(topics)} topic nodes:")
                # Preview only the first three topics, with up to three concepts each
                for rank, node in enumerate(topics[:3], start=1):
                    print(f"      {rank}. {node['label']}")
                    node_concepts = node.get('concepts')
                    if node_concepts:
                        print(f"         Concepts: {', '.join(node_concepts[:3])}")

            if concepts:
                print(f"\n  ‚úì Found {len(concepts)} domain concepts:")
                print(f"      {', '.join(concepts[:10])}")

            print()

        except Exception as e:
            print(f"  ‚úó Error loading graph: {e}")
            print(f"  Falling back to document extraction mode\n")
            use_graph_mode = False
    else:
        # No graph file on disk: tell the user how to build one and fall back
        print(f"  ‚ö†Ô∏è Knowledge graph not found at: {graph_path}")
        print(f"  üí° Run 'python build_graph.py' to create a knowledge graph first")
        print(f"  Falling back to document extraction mode\n")
        use_graph_mode = False

# Fall back to document extraction if not using graph
# Runs only when no topic was given manually: loads the documents from the
# sources directory and asks WebDiscovery to derive a research topic from them.
if research_topic is None and not use_graph_mode:
    print("üîç Document Mode: Loading documents from sources directory...\n")

    rag = VaultRAG(sources_dir=str(sources_dir), verbose=False)
    rag._load_documents()
    documents = rag.documents

    if documents:
        print(f"  ‚úì Loaded {len(documents)} document(s)")

        # Count documents per file extension, in first-seen order
        doc_types = {}
        for loaded in documents:
            suffix = Path(loaded.path).suffix
            doc_types.setdefault(suffix, 0)
            doc_types[suffix] += 1

        type_summary = ', '.join(f'{count}x{ext}' for ext, count in doc_types.items())
        print(f"  Types: {type_summary}\n")

        # Extract research topic using WebDiscovery AI
        print("ü§ñ Extracting research topic with AI...\n")

        # Sample: the first 3 documents, at most 2000 characters of each
        excerpts = [
            f"Document {number}: {sample.title}\n{sample.content[:2000]}"
            for number, sample in enumerate(documents[:3], start=1)
        ]
        combined_content = "\n\n---\n\n".join(excerpts)

        research_topic = discovery.extract_research_topic(combined_content, max_words=15)

        print(f"  ‚úì Extracted topic")
    else:
        # Nothing to analyse, so use the built-in default topic
        print("  ‚ö†Ô∏è No documents found in sources directory.")
        print("  Using default topic.\n")
        research_topic = "EU Data Act and Linked Data governance frameworks"

# Display summary
# Prints which discovery mode ended up active, framed by two 80-character rules.
banner = "=" * 80
print("\n" + banner)
if not use_graph_mode:
    print("üìå MODE: Document-Based Discovery")
    print(f"üéØ Research Topic: {research_topic}")
    word_count = len(research_topic.split())
    print(f"   Length: {word_count} words")
else:
    print("üìå MODE: Knowledge Graph-Guided Discovery")
    print(f"   Using {len(topics)} topics and {len(concepts)} concepts from your refined graph")
print(banner)

🔍 Loading documents from sources directory...

  ✓ Loaded 26 document(s)
  Types: 8x.md, 4x.txt, 12x.pdf, 2x.html

🤖 Extracting research topic with AI...

  ✓ Loaded 26 document(s)
  Types: 8x.md, 4x.txt, 12x.pdf, 2x.html

🤖 Extracting research topic with AI...

  ✓ Extracted topic

🎯 Research Topic:
   The impact of the EU Data Act on transitioning to Linked Data platforms for data accessibility.

   Length: 16 words (target: <15 words)
  ✓ Extracted topic

🎯 Research Topic:
   The impact of the EU Data Act on transitioning to Linked Data platforms for data accessibility.

   Length: 16 words (target: <15 words)


## Step 2: Generate Search Queries

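AI will generate optimized search queries from your research topic or, in graph mode, from the topics and concepts of your knowledge graph. Sources already saved in the sources directory are detected first so they can be skipped later.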

In [None]:
# Automated search with improved query generation
#
# 1. Collect the URLs of sources already saved as Markdown so later steps can
#    skip them.
# 2. Generate search queries, either from the knowledge graph's topics and
#    concepts (graph mode) or from the research topic.
# 3. Print the queries together with instructions for the manual search.
import re

print("üîç Generating search queries...\n")

# Load existing sources to avoid duplicates
print("üìÇ Checking existing sources...")
existing_urls = set()

# First "url: <value>" occurrence in a file; saved articles carry it in their
# frontmatter. Compiled once instead of once per file.
url_pattern = re.compile(r'url:\s*(.+)')

for md_file in sources_dir.glob('*.md'):
    # Best effort: an unreadable or non-UTF-8 file is reported and skipped
    # rather than aborting the scan. Only I/O and decoding errors are
    # handled, so interrupts and programming errors still surface.
    try:
        content = md_file.read_text(encoding='utf-8')
    except (OSError, UnicodeDecodeError) as e:
        print(f"  Skipping unreadable file {md_file.name}: {e}")
        continue

    url_match = url_pattern.search(content)
    if url_match:
        existing_urls.add(url_match.group(1).strip())

print(f"  Found {len(existing_urls)} existing sources to skip\n")

# Generate search queries based on mode
if use_graph_mode:
    print("üß† Generating queries from knowledge graph concepts...")
    queries = discovery.generate_queries_from_graph_concepts(
        topics=topics,
        concepts=concepts,
        num_queries=5
    )
else:
    print("üìù Generating queries from research topic...")
    queries = discovery._generate_search_queries(research_topic)

print(f"\n  ‚úì Generated {len(queries)} targeted queries:\n")
for i, q in enumerate(queries, 1):
    print(f"    {i}. {q}")

print("\n" + "="*80)
print("üí° NEXT STEPS:")
print("   1. Copy each query above")
print("   2. Search on Google Scholar, arXiv, or your preferred source")
print("   3. Copy relevant article URLs")
print("   4. Paste URLs in the next cell")
print("="*80)

🔍 Searching for articles automatically...

📂 Checking existing sources...
  Found 1 existing sources to skip

📝 Generating targeted search queries...
  Generated 5 queries:

    1. "EU Data Act impact on Linked Data platforms for data accessibility"
    2. "transitioning to Linked Data platforms under EU Data Act regulations"
    3. "effects of EU Data Act on data accessibility in Linked Data environments"
    4. "EU Data Act implications for Linked Data implementation and data sharing"
    5. "challenges of adopting Linked Data platforms post EU Data Act enactment"

[1/5] Searching: "EU Data Act impact on Linked Data platforms for data accessibility"...
  Generated 5 queries:

    1. "EU Data Act impact on Linked Data platforms for data accessibility"
    2. "transitioning to Linked Data platforms under EU Data Act regulations"
    3. "effects of EU Data Act on data accessibility in Linked Data environments"
    4. "EU Data Act implications for Linked Data implementation and 

In [12]:
# Print the title of every collected search result, one per line.
titles = (hit['title'] for hit in all_results)
for title in titles:
    print(title)

Understanding switching rights under the Data Act
Simpler EU digital rules and new digital wallets to save ...
EU Data Act Significant New Switching Requirements Due ...
The impact of the EU Data Act on data processing services ...
EU Data Act operational impacts: Balancing risks and ...
Data Act explained | Shaping Europe's digital future
Data Act | Shaping Europe's digital future - European Union
EU Data Act: A new era for data sharing has begun


## Step 3: Select Articles to Extract

Choose which search results to download and process.

In [None]:
# Select articles to extract
#
# all_results is expected to be a list of search-result dicts with 'title'
# and 'url' keys. selected_indices holds 1-based positions into that list.
# Options:
#   1. Extract all: selected_indices = list(range(1, len(all_results) + 1))
#   2. Extract first N: selected_indices = list(range(1, 6))
#   3. Select specific: selected_indices = [1, 3, 5, 7, 10]

# On a fresh kernel nothing may have assigned all_results yet. Start from an
# empty list so this cell reports "no results" and the manual-URL path below
# stays usable, instead of stopping with a NameError.
try:
    all_results
except NameError:
    all_results = []

if len(all_results) == 0:
    print("‚ö†Ô∏è No search results available.")
    print("   Run Step 2 first, or add manual URLs below.")
    selected_indices = []
else:
    # Default: First 5 results
    selected_indices = list(range(1, min(6, len(all_results) + 1)))

    print(f"üìã Selected {len(selected_indices)} articles:\n")
    for i in selected_indices:
        # Indices are 1-based. Ignore anything out of range; without the lower
        # bound, 0 or a negative value would wrap around to the end of the list.
        if 1 <= i <= len(all_results):
            result = all_results[i-1]
            print(f"[{i}] {result['title'][:70]}...")
            print(f"    {result['url'][:75]}...")
            print()

# Add manual URLs if needed
manual_urls = []
# Uncomment to add specific URLs not in search results:
# manual_urls = [
#     "https://example.com/article1",
#     "https://example.com/article2",
# ]

if selected_indices:
    print(f"üí° Edit selected_indices above to change selection")
if manual_urls:
    print(f"   + {len(manual_urls)} manual URLs will be added")

In [None]:
# Build final URL list from selected results + manual URLs
#
# Result: `urls` holds each URL once, in selection order (search results
# first, then manual URLs), without any URL already present in existing_urls.

# Add selected search results. selected_indices is 1-based; out-of-range
# entries are ignored. The lower bound matters: without it an index of 0
# would silently pick the last result through all_results[-1].
urls = [
    all_results[i - 1]['url']
    for i in selected_indices
    if 1 <= i <= len(all_results)
]

# Add manual URLs if any
urls.extend(manual_urls)

# Remove duplicates while preserving order, and drop already-saved sources
seen = set()
unique_urls = []
for url in urls:
    if url in seen or url in existing_urls:
        continue
    seen.add(url)
    unique_urls.append(url)

urls = unique_urls

if not urls:
    print("‚ö†Ô∏è No URLs selected. Run Step 3 to select articles.")
else:
    print(f"‚úì Ready to extract {len(urls)} articles:\n")
    for i, url in enumerate(urls, 1):
        print(f"{i}. {url[:80]}...")
    print(f"\nüí° Proceed to Step 4 to extract content")

## Step 4: Extract and Save Articles

Download article content, assess quality, and save high-quality sources.

In [None]:
# Extract, assess, and auto-save high-quality articles
#
# For every URL in `urls`: extract the article, let WebDiscovery rate it, and
# write it to the sources directory as Markdown with YAML frontmatter when the
# quality score is at least 6. Errors on one article are reported and counted
# without stopping the remaining ones. A summary is printed at the end.
import re
from datetime import datetime

if not urls:
    print("‚ö†Ô∏è No URLs to process. Run Steps 2-3 first.")
else:
    print(f"üîç Extracting {len(urls)} articles...\n")

    saved_count = 0
    skipped_count = 0
    failed_count = 0

    # Topic tag for the frontmatter, computed once for all articles.
    # research_topic is None in knowledge-graph mode. Calling .lower() on it
    # inside the loop raised AttributeError for every article, so each one was
    # counted as failed and nothing was saved. Use a fixed tag in that case.
    if research_topic:
        topic_tag = research_topic.lower().replace(' ', '-')[:30]
    else:
        topic_tag = 'knowledge-graph'

    for i, url in enumerate(urls, 1):
        print(f"[{i}/{len(urls)}] {url[:65]}...")

        try:
            # Extract article content
            article = discovery.extract_article(url)

            if not article:
                print(f"  ‚úó Extraction failed")
                failed_count += 1
                continue

            # Assess quality with AI
            assessment = discovery.assess_quality(article)
            quality_score = assessment.get('quality_score', 0)
            author = article.get('author', 'Unknown')

            print(f"  üìÑ {article['title'][:60]}...")
            print(f"  üë§ {author}")
            print(f"  üìè {len(article['content'])} chars")
            print(f"  ‚≠ê Quality: {quality_score}/10")

            # Save if quality meets threshold (≥6)
            if quality_score >= 6:
                # Create safe filename: strip punctuation, collapse runs of
                # spaces/hyphens into one hyphen, cap the length at 100.
                safe_title = re.sub(r'[^\w\s-]', '', article['title'])
                safe_title = re.sub(r'[-\s]+', '-', safe_title)[:100]
                # A title made only of punctuation would otherwise produce
                # the hidden file ".md".
                if not safe_title:
                    safe_title = 'untitled'
                filename = f"{safe_title}.md"
                filepath = sources_dir / filename

                # One timestamp so frontmatter and body always agree
                extracted_on = datetime.now().strftime('%Y-%m-%d')

                # Build markdown with frontmatter
                content = f"""---
title: {article['title']}
author: {author}
url: {article['url']}
date_extracted: {extracted_on}
quality_score: {quality_score}
tags: [web-article, research, {topic_tag}]
---

# {article['title']}

**Author:** {author}  
**Source:** {article['url']}  
**Extracted:** {extracted_on}  
**Quality:** {quality_score}/10

---

{article['content']}
"""

                # Write to file
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(content)

                print(f"  ‚úì Saved: {filename}")
                saved_count += 1
            else:
                print(f"  ‚äò Skipped: Quality {quality_score} < 6")
                skipped_count += 1

        except Exception as e:
            # Per-article boundary: report the error and go on to the next URL
            print(f"  ‚úó Error: {str(e)[:60]}...")
            failed_count += 1

        print()

    # Summary
    print("=" * 70)
    print(f"‚úÖ Extraction Complete\n")
    print(f"   Saved:   {saved_count} high-quality articles (‚â•6/10)")
    print(f"   Skipped: {skipped_count} low-quality articles (<6/10)")
    print(f"   Failed:  {failed_count} extraction errors")
    print(f"   Total:   {len(urls)} articles processed")
    print(f"\n   Location: {sources_dir.resolve()}")

    if saved_count > 0:
        print(f"\nüîÑ Next Steps:")
        print(f"   1. Build knowledge graph:")
        print(f"      python build_graph.py")
        print(f"   2. Start chat interface:")
        print(f"      python server.py")
        print(f"   3. Generate synthesis article:")
        print(f"      python generate_article_from_graph.py data/graphs/knowledge_graph.ttl")
    else:
        print(f"\nüí° No articles saved. Try:")
        print(f"   ‚Ä¢ Lower quality threshold in code (change 'if quality_score >= 6')")
        print(f"   ‚Ä¢ Select different search results")
        print(f"   ‚Ä¢ Adjust research topic to be more specific")