In [None]:
# Setup and imports
import json
import re
import sys
from datetime import datetime
from pathlib import Path

from dotenv import load_dotenv

# The project root must be on sys.path before the local packages are imported.
sys.path.insert(0, '..')

from core.web_discovery import WebDiscovery
from features.research_agent import ResearchAgent

# Read a local .env file into the process environment before the components
# below are constructed (presumably they read credentials from it - confirm).
load_dotenv()

# Initialize components
discovery = WebDiscovery()
research_agent = ResearchAgent()

# Output directory for the per-source markdown files and the literature note.
# The path is relative to the kernel's working directory.
sources_dir = Path('../data/sources')
sources_dir.mkdir(parents=True, exist_ok=True)

# The original status string was mis-encoded (UTF-8 read as Mac Roman); this is
# the intended check mark.
print("✓ Research workflow initialized")

## Step 1: Define Research Question

What question are you trying to answer with this research?

In [None]:
# The question this workflow investigates; later cells pass it to the quality
# assessment and synthesis steps and embed it in the saved notes.
research_question = (
    "How do knowledge graphs improve data integration in cloud environments?"
)

print("Research Question: " + research_question)

## Step 2: Batch URL Input

Paste all your source URLs here (one per line).

In [None]:
# Paste all URLs here
urls_text = """
https://example.com/source1
https://example.com/source2
https://example.com/source3
https://example.com/source4
https://example.com/source5
"""


def parse_urls(text):
    """Return the HTTP(S) URLs found in ``text``, one candidate per line.

    Each line is stripped of surrounding whitespace *before* the scheme check,
    so indented or tab-prefixed URLs are kept. Blank lines and lines that do
    not start with ``http://`` or ``https://`` are ignored. Order is preserved.
    """
    stripped_lines = (line.strip() for line in text.splitlines())
    return [
        line for line in stripped_lines
        if line.startswith(('http://', 'https://'))
    ]


# Parse URLs
urls = parse_urls(urls_text)

print(f"\n📋 Processing {len(urls)} URLs")
for i, url in enumerate(urls, 1):
    print(f"  {i}. {url}")

## Step 3: Scrape and Assess Quality

Extract content and assess quality for each source.

In [None]:
# Scrape URLs using ResearchAgent
# (Status strings below were mis-encoded in the original; the intended
# symbols are restored.)
print("\n🔍 Scraping sources...\n")

scraped_sources = research_agent.scrape_urls(urls)

# Every input URL without a corresponding scraped result is counted as failed.
failed_count = len(urls) - len(scraped_sources)

print(f"\n✓ Successfully scraped {len(scraped_sources)} sources")
print(f"✗ Failed to scrape {failed_count} sources")

In [None]:
# Assess quality of each source

# Matches "Quality Score: 8/10" and the variants commonly seen in generated
# text: markdown emphasis around the label ("**Quality Score:** 8/10"),
# different casing, a decimal score ("8.5/10") and spaces around the slash.
QUALITY_SCORE_PATTERN = re.compile(
    r'Quality Score[\W_]*?(\d+(?:\.\d+)?)\s*/\s*10',
    re.IGNORECASE,
)


def parse_quality_score(assessment_text):
    """Extract the integer quality score from an assessment text.

    Args:
        assessment_text: Free-form assessment text, or ``None``.

    Returns:
        The score as an ``int`` (a decimal score is truncated, e.g. 8.5 -> 8),
        or ``None`` when no "Quality Score: N/10" statement is found.
    """
    match = QUALITY_SCORE_PATTERN.search(assessment_text or '')
    if match is None:
        return None
    return int(float(match.group(1)))


print("\n📊 Assessing source quality...\n")

assessed_sources = []

for source in scraped_sources:
    print(f"Assessing: {source['title'][:60]}...")

    assessment = research_agent.assess_quality(source, research_question)

    # Parse quality score from assessment
    parsed_score = parse_quality_score(assessment.get('assessment'))
    if parsed_score is None:
        # Make the fallback visible: a score of 0 removes the source in the
        # filtering step, so a silent default would hide parsing problems.
        print("  (no 'Quality Score: N/10' found in assessment; defaulting to 0)")
    quality_score = parsed_score if parsed_score is not None else 0

    # Build a new record rather than mutating the scraped one, so re-running
    # this cell always starts from the unmodified scrape results.
    assessed_sources.append({
        **source,
        'assessment': assessment,
        'quality_score': quality_score,
    })
    print(f"  Score: {quality_score}/10")
    print()

print(f"✓ Assessed {len(assessed_sources)} sources")

## Step 4: Filter by Quality

Keep only high-quality sources (score >= 7).

In [None]:
# Filter by quality threshold
# Minimum score (out of 10) a source needs in order to be kept.
quality_threshold = 7

high_quality_sources = [
    s for s in assessed_sources if s['quality_score'] >= quality_threshold
]
low_quality_count = len(assessed_sources) - len(high_quality_sources)

# (Status strings below were mis-encoded in the original; the intended
# symbols are restored.)
print("\n📈 Quality Analysis:")
print(f"  Total sources: {len(assessed_sources)}")
print(f"  High quality (>= {quality_threshold}): {len(high_quality_sources)}")
print(f"  Low quality (< {quality_threshold}): {low_quality_count}")

print("\n🌟 High Quality Sources:")
for source in high_quality_sources:
    print(f"  [{source['quality_score']}/10] {source['title']}")
    print(f"           {source['url']}")

## Step 5: Synthesize Sources

Create a synthesis of all high-quality sources.

In [None]:
# Synthesize sources
print("\n🧠 Synthesizing sources...\n")

if high_quality_sources:
    synthesis = research_agent.synthesize_sources(
        sources=high_quality_sources,
        research_question=research_question
    )
else:
    # Nothing passed the quality filter: skip the synthesis call and keep
    # `synthesis` defined so the literature-note cell still runs.
    synthesis = "No sources met the quality threshold, so there is nothing to synthesize."

banner = "=" * 80
print(banner)
print("RESEARCH SYNTHESIS")
print(banner)
print(synthesis)
print(banner)

## Step 6: Save Individual Sources

Save each high-quality source as a markdown file.

In [None]:
# Save individual sources
print("\n💾 Saving sources...\n")


def yaml_str(value):
    """Render ``value`` as a double-quoted YAML scalar.

    A JSON string literal is also a valid YAML double-quoted scalar, so this
    keeps the front matter parseable when the text contains ``: ``, quotes,
    ``#`` or line breaks.
    """
    return json.dumps(str(value), ensure_ascii=False)


def unique_filename_stem(title, used_stems):
    """Return a filesystem-safe, not-yet-used file stem derived from ``title``.

    Args:
        title: Source title to sanitize.
        used_stems: Set of lower-cased stems already handed out in this run;
            the returned stem is added to it.

    Returns:
        The sanitized title (at most 100 characters), ``'untitled'`` when
        nothing survives sanitizing, with ``-2``, ``-3``, ... appended if the
        stem was already taken.
    """
    stem = re.sub(r'[^\w\s-]', '', title)
    stem = re.sub(r'[-\s]+', '-', stem)[:100] or 'untitled'

    candidate = stem
    suffix = 2
    # Compare case-insensitively so names cannot clash on case-insensitive
    # filesystems either.
    while candidate.lower() in used_stems:
        candidate = f"{stem}-{suffix}"
        suffix += 1
    used_stems.add(candidate.lower())
    return candidate


# Values that are identical for every file are computed once, up front.
extracted_on = datetime.now().strftime('%Y-%m-%d')

# Extract key topics from research question: the first three words longer
# than three characters, with surrounding punctuation removed.
question_words = re.findall(r'\w+(?:-\w+)*', research_question.lower())
tags = ['research', 'high-quality'] + [w for w in question_words if len(w) > 3][:3]

saved_files = []
# Reset on every run, so re-running the cell rewrites the same files instead
# of piling up numbered copies.
used_stems = set()

for source in high_quality_sources:
    title = source['title']
    author = source.get('author') or 'Unknown'

    # Create safe filename
    filename = f"{unique_filename_stem(title, used_stems)}.md"
    filepath = sources_dir / filename

    # Create markdown content
    content = f"""---
title: {yaml_str(title)}
author: {yaml_str(author)}
url: {source['url']}
quality_score: {source['quality_score']}/10
date_extracted: {extracted_on}
research_question: {yaml_str(research_question)}
tags: [{', '.join(tags)}]
---

# {title}

**Author:** {author}  
**Source:** {source['url']}  
**Quality Score:** {source['quality_score']}/10  
**Extracted:** {extracted_on}

## Quality Assessment

{source['assessment'].get('assessment', 'No assessment available')}

---

## Content

{source['content']}
"""

    # Write file
    filepath.write_text(content, encoding='utf-8')

    saved_files.append(filepath)
    print(f"  ✓ {filename}")

print(f"\n✓ Saved {len(saved_files)} sources")

## Step 7: Create Literature Note

Create a synthesis note linking all sources.

In [None]:
# Create literature note with synthesis

# Take one timestamp so the filename, front matter and body all agree.
created_at = datetime.now()
created_date = created_at.strftime('%Y-%m-%d')

lit_note_filename = f"Literature-{created_at.strftime('%Y%m%d-%H%M%S')}.md"
lit_note_path = sources_dir / lit_note_filename

# `saved_files` is filled by the save cell in the same order as
# `high_quality_sources`; a length mismatch means that cell is stale.
assert len(saved_files) == len(high_quality_sources), (
    "saved_files is out of date - re-run the 'Save Individual Sources' cell first"
)

# Build links to sources
# Link to the saved file's stem (wiki links are normally resolved by file
# name) and keep the original title as the display alias after "|".
# NOTE(review): assumes the note tool supports [[target|alias]] - confirm.
source_links = "\n".join(
    f"- [[{saved_path.stem}|{source['title']}]] - Quality: {source['quality_score']}/10"
    for source, saved_path in zip(high_quality_sources, saved_files)
) or "_No sources met the quality threshold._"

# The title is written as a double-quoted scalar so a question containing
# ": " or quotes cannot break the YAML front matter.
lit_note_content = f"""---
title: {json.dumps('Literature Review - ' + research_question, ensure_ascii=False)}
type: literature-note
date_created: {created_date}
num_sources: {len(high_quality_sources)}
tags: [literature-review, synthesis]
---

# Literature Review: {research_question}

**Date:** {created_date}  
**Sources:** {len(high_quality_sources)} high-quality articles

## Research Question

{research_question}

## Synthesis

{synthesis}

## Sources

{source_links}

## Methodology

- Total URLs processed: {len(urls)}
- Successfully scraped: {len(scraped_sources)}
- Quality threshold: {quality_threshold}/10
- High-quality sources: {len(high_quality_sources)}
- Date: {created_at.strftime('%Y-%m-%d %H:%M')}
"""

# Save literature note
lit_note_path.write_text(lit_note_content, encoding='utf-8')

# (Status strings below were mis-encoded in the original; the intended
# symbols are restored.)
print(f"\n📝 Created literature note: {lit_note_filename}")
print("\n✅ Research workflow complete!")
print("\n📊 Summary:")
print(f"   - Processed: {len(urls)} URLs")
print(f"   - Saved: {len(saved_files)} high-quality sources")
print("   - Created: 1 literature note with synthesis")
print("\n🔄 Next steps:")
print("   1. Launch UI: python server.py")
print("   2. Build knowledge graph: see test_graph.py")
print("   3. Ask questions about your research!")