# Ingest News Articles for Scrantenna
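This notebook pulls news articles about Scranton from multiple sources via NewsAPI, adds simplified Subject-Verb-Object (SVO) versions of each article's text, and saves them for further processing.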

In [ ]:
import requests
import json
import os
import spacy
import re
from datetime import datetime
from typing import List, Dict, Tuple

# Load SpaCy model
# Loading is best-effort: when it fails, `nlp` is set to None and the rest of
# the notebook skips SVO extraction. Catch Exception rather than using a bare
# `except:` so KeyboardInterrupt/SystemExit still propagate, and report the
# underlying error so failures other than a missing model are not hidden.
try:
    nlp = spacy.load("en_core_web_sm")
except Exception as exc:
    print("SpaCy model not found. Install with: python -m spacy download en_core_web_sm")
    print(f"(load error: {exc})")
    nlp = None

# Define constants
# NOTE(review): the fallback key below is committed in source. Prefer setting
# the NEWSAPI_KEY environment variable (and rotating the committed key); the
# fallback is kept only so existing runs keep working unchanged.
API_KEY = os.environ.get('NEWSAPI_KEY', 'be93936988fd4df185bd56e8a11125a0')
# Search term sent to NewsAPI and recorded in the saved metadata.
QUERY = "Scranton"
# Output directory for the dated JSON files (relative to the working directory).
DATA_DIR = "../data/daily"
# Full request URL for the NewsAPI "everything" endpoint.
NEWS_URL = f"https://newsapi.org/v2/everything?q={QUERY}&apiKey={API_KEY}"

def _modifier_phrase(head) -> str:
    """Return *head* together with its compound/amod/det children, in document order."""
    tokens = [head]
    tokens.extend(
        child for child in head.children
        if child.dep_ in ("compound", "amod", "det")
    )
    # Sort by token index so the phrase reads as it does in the text.
    # (Prepending each modifier in turn would reverse their order, e.g.
    # "The old mayor" -> "old The mayor".)
    tokens.sort(key=lambda tok: tok.i)
    return " ".join(tok.text for tok in tokens)


def extract_svo_triplets(text: str) -> List[Tuple[str, str, str]]:
    """Extract Subject-Verb-Object triplets from text using SpaCy dependency parsing.

    For every sentence, each token that is both tagged ``VERB`` and is the
    dependency ``ROOT`` is treated as the verb. Its direct children labelled
    ``nsubj``/``nsubjpass`` supply the subject and those labelled
    ``dobj``/``pobj``/``attr`` supply the object; when several children match,
    the last one wins. Subject and object are expanded with their
    ``compound``/``amod``/``det`` children.

    Args:
        text: Raw text to parse.

    Returns:
        A list of ``(subject, verb, object)`` tuples. The object is the
        placeholder ``"[no object]"`` when none was found. Verbs without a
        subject are skipped. Returns ``[]`` when the SpaCy model is not
        loaded or *text* is empty.
    """
    if not nlp or not text:
        return []

    triplets = []
    for sent in nlp(text).sents:
        for token in sent:
            if token.pos_ != "VERB" or token.dep_ != "ROOT":
                continue

            verb = token.text
            subject = None
            obj = None

            for child in token.children:
                if child.dep_ in ("nsubj", "nsubjpass"):
                    subject = _modifier_phrase(child)
                elif child.dep_ in ("dobj", "pobj", "attr"):
                    obj = _modifier_phrase(child)

            if subject and verb:
                triplets.append((subject, verb, obj or "[no object]"))

    return triplets

def simplify_to_svo(text: str) -> str:
    """Convert text to boring, precise SVO statements.

    Args:
        text: Raw text to simplify.

    Returns:
        - ``""`` when *text* is empty.
        - Up to five ``"Subject verb object."`` sentences joined by spaces
          when triplets were extracted.
        - ``"Article mentions ..."`` listing up to three named entities when
          no triplet was found but the SpaCy model detected entities.
        - ``"No clear statements extracted."`` otherwise.
    """
    if not text:
        return ""

    triplets = extract_svo_triplets(text)

    if not triplets:
        if nlp:
            doc = nlp(text)
            entities = [(ent.text, ent.label_) for ent in doc.ents]
            if entities:
                mentions = ', '.join(f'{name} ({label})' for name, label in entities[:3])
                return f"Article mentions {mentions}"
        return "No clear statements extracted."

    # Upper-case only the first character of the subject. str.capitalize()
    # would also lower-case the rest, mangling proper nouns and acronyms
    # ("Joe Biden" -> "Joe biden", "FBI" -> "Fbi").
    return " ".join(
        f"{subj[:1].upper()}{subj[1:]} {verb} {obj}."
        for subj, verb, obj in triplets[:5]
    )

def process_article(article: Dict) -> Dict:
    """Return a shallow copy of *article* enriched with SVO summaries.

    When the SpaCy model is loaded, each non-empty ``title``, ``description``
    and ``content`` field gets a sibling ``<field>_svo`` entry produced by
    ``simplify_to_svo``. Without the model the copy is returned unchanged.
    The input dict itself is never modified.
    """
    enriched = article.copy()

    # Nothing to add when SpaCy is unavailable.
    if not nlp:
        return enriched

    field_pairs = (
        ('title', 'title_svo'),
        ('description', 'description_svo'),
    )
    for source_key, svo_key in field_pairs:
        raw_text = article.get(source_key)
        if raw_text:
            enriched[svo_key] = simplify_to_svo(raw_text)

    body = article.get('content')
    if body:
        # Strip the "[+N chars]" marker before parsing
        # (presumably NewsAPI's truncation suffix -- confirm).
        stripped_body = re.sub(r'\[\+\d+ chars\]', '', body)
        enriched['content_svo'] = simplify_to_svo(stripped_body)

    return enriched

def fetch_news(timeout: float = 30):
    """Fetch news articles from NewsAPI

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the response status code is not 200; the message
            carries the response body.
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    response = requests.get(NEWS_URL, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch news articles: {response.text}")
    return response.json()

def save_news(news_data):
    """Save news articles with both original and SVO formats

    Each article under ``news_data['articles']`` is passed through
    ``process_article`` and the result is written, together with query
    metadata, to ``DATA_DIR/scranton_news_<YYYY-MM-DD>.json``.

    Args:
        news_data: Decoded NewsAPI response; a missing or null ``articles``
            entry is treated as an empty list.

    Returns:
        The path of the JSON file that was written.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(DATA_DIR, exist_ok=True)

    # Process articles to add SVO versions
    processed_articles = [
        process_article(article)
        for article in (news_data.get('articles') or [])
    ]

    # Take one timestamp so "fetched_at" and the filename date always agree,
    # even when the save straddles midnight.
    now = datetime.now()

    # Create data structure with metadata
    output_data = {
        "query": QUERY,
        "fetched_at": now.isoformat(),
        "total_articles": len(processed_articles),
        "has_svo": nlp is not None,
        "articles": processed_articles
    }

    # Save with date-based filename
    file_path = os.path.join(DATA_DIR, f"scranton_news_{now.strftime('%Y-%m-%d')}.json")
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2)

    print(f"Saved {len(processed_articles)} articles to {file_path}")
    if nlp:
        print("✓ SVO versions included")
    else:
        print("⚠ SVO versions not generated (SpaCy not available)")

    return file_path

# Fetch and save articles with SVO processing.
# Runs when the cell executes: requests the articles, then writes the processed
# JSON file. `saved_file` holds the output path and is read by the
# sample-display code further down.
news_data = fetch_news()
saved_file = save_news(news_data)

In [ ]:
# Display sample articles with toggle between original and SVO
def display_sample_articles(file_path, num_samples=3):
    """Display sample articles showing both original and SVO versions

    Args:
        file_path: Path to a JSON file containing an ``articles`` list.
        num_samples: Maximum number of articles to print.

    Missing *or null* fields are shown as ``N/A`` / ``Unknown``. A plain
    ``.get(key, default)`` is not enough here: it returns ``None`` when the
    key is present with a null value, which would crash on slicing or on the
    nested ``.get``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"\n{'='*80}")
    print(f"Sample Articles from {file_path}")
    print(f"{'='*80}\n")

    for i, article in enumerate((data.get('articles') or [])[:num_samples]):
        source = article.get('source') or {}
        description = article.get('description') or 'N/A'

        print(f"Article {i+1}:")
        print(f"Source: {source.get('name') or 'Unknown'}")
        print(f"\nOriginal Title: {article.get('title') or 'N/A'}")
        print(f"SVO Title: {article.get('title_svo', 'N/A')}")
        print(f"\nOriginal Description: {description[:150]}...")
        print(f"SVO Description: {article.get('description_svo', 'N/A')}")
        print(f"{'-'*80}\n")

# Show samples from the saved file.
# The guard skips the display when `saved_file` was never assigned, e.g. when
# the fetch/save step has not been run or raised before assigning it.
if 'saved_file' in locals():
    display_sample_articles(saved_file)