In [None]:
# Setup and imports
import sys
sys.path.insert(0, '..')

from core.web_discovery import WebDiscovery
from pathlib import Path
from dotenv import load_dotenv

# Load key=value pairs from a local .env file into the process environment.
# NOTE(review): presumably WebDiscovery reads its credentials/settings from
# the environment, which is why this runs before the constructor — confirm.
load_dotenv()

# Initialize web discovery
discovery = WebDiscovery()
# Output directory for the markdown files written by the final cell. The path
# is relative to the working directory; it is created up front and an already
# existing directory is tolerated, so this cell can be re-run safely.
sources_dir = Path('../data/sources')
sources_dir.mkdir(parents=True, exist_ok=True)

print("‚úì Web Discovery initialized")

## Step 1: Enter Research Topic

What topic are you researching?

In [None]:
# Edit this value: it is the topic the search queries are generated for and
# it is also turned into a tag on every saved article.
research_topic = "Knowledge graphs and semantic web technologies"

# Echo the topic so the active value is visible in the cell output
print("Research topic: " + research_topic)

## Step 2: Generate Search Queries

AI will generate optimized search queries for your topic.

In [None]:
# Ask the discovery helper for search queries derived from the research topic
queries = discovery._generate_search_queries(research_topic)

# Numbered list (starting at 1) of the generated queries
print("\nüìù Generated Search Queries:\n")
for position, suggestion in enumerate(queries, start=1):
    print(f"{position}. {suggestion}")

# Suggested places to run the queries by hand
print("\nüí° Copy these queries and search on:")
for venue_line in (
    "   - Google Scholar",
    "   - arXiv",
    "   - Medium",
    "   - Academic journals",
):
    print(venue_line)

## Step 3: Paste URLs

After searching, paste the URLs you want to extract (one per line) into the cell below, replacing the `example.com` placeholders.

In [None]:
# Paste your URLs here (one per line)
urls_text = """
https://example.com/article1
https://example.com/article2
https://example.com/article3
"""


def parse_urls(text):
    """Return the URLs contained in ``text``, one per line.

    Leading/trailing whitespace is stripped from every line and blank lines
    are skipped. Repeated URLs are dropped (the first occurrence wins, order
    is preserved) so the same page is not extracted more than once.

    Args:
        text: Free-form text with one URL per line.

    Returns:
        list[str]: Unique, stripped, non-empty lines in their original order.
    """
    stripped_lines = (line.strip() for line in text.split('\n'))
    # dict.fromkeys de-duplicates while keeping insertion order
    return list(dict.fromkeys(line for line in stripped_lines if line))


# Parse URLs
urls = parse_urls(urls_text)

print(f"\nüìã Found {len(urls)} URLs to process")
for i, url in enumerate(urls, 1):
    print(f"{i}. {url}")

## Step 4: Extract Articles

Extract content from URLs and preview quality.

In [None]:
# Extract articles
# `articles` is rebuilt from scratch on every run of this cell, so re-running
# it never accumulates duplicates from a previous run.
articles = []

print("\nüîç Extracting articles...\n")

for i, url in enumerate(urls, 1):
    print(f"[{i}/{len(urls)}] Processing: {url}")

    try:
        article = discovery.extract_article(url)

        if article:
            # Get quality assessment
            assessment = discovery.assess_quality(article)
            article['assessment'] = assessment

            # Build the whole summary BEFORE registering the article. A
            # missing 'title'/'content' key or an assessment without .get()
            # raises here, and the article must then not end up in
            # `articles`, where later cells would crash on it.
            summary_lines = [
                f"  ‚úì {article['title']}",
                f"    Author: {article.get('author', 'Unknown')}",
                f"    Length: {len(article['content'])} chars",
                f"    Quality: {assessment.get('quality_score', 'N/A')}",
            ]
            articles.append(article)
            print("\n".join(summary_lines))
        else:
            print("  ‚úó Could not extract article")

    except Exception as e:
        # Deliberately broad: one failing URL is reported and skipped so the
        # remaining URLs are still processed.
        print(f"  ‚úó Error: {e}")

    print()

print(f"\n‚úì Successfully extracted {len(articles)} articles")

## Step 5: Review and Filter

Review extracted articles and select which ones to save.

In [None]:
# List every extracted article next to the zero-based index used to select it
print("\nüìö Extracted Articles:\n")

for idx, entry in enumerate(articles):
    print(f"[{idx}] {entry['title']}")
    print(f"    URL: {entry['url']}")
    print(f"    Author: {entry.get('author', 'Unknown')}")
    print(f"    Length: {len(entry['content'])} characters")

    # The quality line is only shown when an assessment is attached
    if 'assessment' in entry:
        print(f"    Quality: {entry['assessment'].get('quality_score', 'N/A')}")

    print()

# Usage hints for the selection cell that follows
for hint in (
    "\nüí° To save specific articles, set indices_to_save below.",
    "   Example: indices_to_save = [0, 1, 3]  # Save articles 0, 1, and 3",
    "   Or leave empty to save all: indices_to_save = []",
):
    print(hint)

In [None]:
# Select which articles to save (empty list = save all)
indices_to_save = []  # Change this to select specific articles

if not indices_to_save:
    # Nothing picked explicitly: keep every extracted article
    articles_to_save = articles
else:
    total = len(articles)
    # Accept exactly the positions that list indexing accepts. Checking only
    # the upper bound would let a negative index beyond the list length
    # through and raise IndexError.
    usable = [i for i in indices_to_save if -total <= i < total]
    ignored = [i for i in indices_to_save if not (-total <= i < total)]
    articles_to_save = [articles[i] for i in usable]

    # Tell the user instead of silently dropping a mistyped index
    if ignored:
        print(f"Ignoring out-of-range indices {ignored}; {total} articles available")

print(f"Selected {len(articles_to_save)} articles to save")

## Step 6: Save to Sources Directory

Save selected articles as markdown files in `data/sources/`.

In [None]:
# Save articles
import json
import re
from datetime import datetime


def slugify_title(title, max_length=100):
    """Turn an article title into a filesystem-friendly file stem.

    Characters other than word characters, whitespace and hyphens are removed,
    runs of whitespace/hyphens collapse to a single hyphen, the result is cut
    to ``max_length`` characters and stray leading/trailing hyphens are
    dropped. Falls back to ``'untitled'`` when nothing usable remains, so the
    file is never named just ``.md``.
    """
    slug = re.sub(r'[^\w\s-]', '', title)
    slug = re.sub(r'[-\s]+', '-', slug)[:max_length].strip('-')
    return slug or 'untitled'


def yaml_quote(value):
    """Render ``value`` as a double-quoted scalar for the YAML front matter.

    Unquoted values break the front matter as soon as they contain YAML
    syntax such as ``: `` or `` #`` (common in article titles). A JSON string
    literal is used as the quoted form; non-ASCII characters are kept as-is.
    """
    return json.dumps(str(value), ensure_ascii=False)


saved_files = []
# File names already written in this run, compared case-insensitively, so two
# articles whose titles sanitise to the same name do not overwrite each other.
used_names = set()
# Computed once so both date fields of every file agree
extracted_on = datetime.now().strftime('%Y-%m-%d')
topic_tag = research_topic.lower().replace(' ', '-')

print("\nüíæ Saving articles...\n")

for article in articles_to_save:
    # Create safe, unique filename. Only names from this run are considered,
    # so re-running the notebook overwrites its own earlier output instead of
    # piling up numbered copies.
    stem = slugify_title(article['title'])
    filename = f"{stem}.md"
    suffix = 2
    while filename.lower() in used_names:
        filename = f"{stem}-{suffix}.md"
        suffix += 1
    used_names.add(filename.lower())
    filepath = sources_dir / filename

    author = article.get('author', 'Unknown')

    # Create markdown content with frontmatter (string values are quoted)
    content = f"""---
title: {yaml_quote(article['title'])}
author: {yaml_quote(author)}
url: {yaml_quote(article['url'])}
date_extracted: {extracted_on}
tags: [web-article, {yaml_quote(topic_tag)}]
---

# {article['title']}

**Author:** {author}  
**Source:** {article['url']}  
**Extracted:** {extracted_on}

---

{article['content']}
"""

    # Write file
    filepath.write_text(content, encoding='utf-8')

    saved_files.append(filepath)
    print(f"  ‚úì Saved: {filename}")

print(f"\n‚úÖ Successfully saved {len(saved_files)} articles to data/sources/")
print("\nüîÑ Next steps:")
print("   1. Launch the UI: python server.py")
print("   2. Ask questions about your new sources!")